Coverage for tests/test_ingest.py: 18%
249 statements
« prev ^ index » next coverage.py v7.5.0, created at 2024-04-30 03:00 -0700
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22import json
23import os
24import pickle
25import shutil
26import tempfile
27import unittest
29import lsst.daf.butler.tests as butlerTests
30from lsst.daf.butler import Butler, Config, DataCoordinate, Registry
31from lsst.daf.butler.registry import ConflictingDefinitionError
32from lsst.obs.base import RawIngestTask
33from lsst.obs.base.ingest_tests import IngestTestBase
34from lsst.obs.base.instrument_tests import DummyCam
35from lsst.utils.introspection import get_full_type_name
# Directory containing this test module, and the ingest test data below it.
TESTDIR = os.path.abspath(os.path.dirname(__file__))
INGESTDIR = os.path.join(TESTDIR, "data", "ingest")
class RawIngestTestCase(IngestTestBase, unittest.TestCase):
    """Test ingest using JSON sidecar files."""

    # Configuration consumed by the IngestTestBase machinery.
    ingestDatasetTypeName = "raw_dict"
    rawIngestTask = get_full_type_name(RawIngestTask)
    curatedCalibrationDatasetTypes = ("testCalib",)
    ingestDir = TESTDIR
    instrumentClassName = "lsst.obs.base.instrument_tests.DummyCam"
    file = os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")
    dataIds = [{"instrument": "DummyCam", "exposure": 100, "detector": 0}]
    seed_config = os.path.join(TESTDIR, "data", "curated", "seed.yaml")

    @property
    def visits(self):
        """Mapping of each visit data ID to its exposure data IDs."""
        butler = Butler(self.root, collections=[self.outputRun])
        universe = butler.dimensions
        visit_id = DataCoordinate.standardize(instrument="DummyCam", visit=100, universe=universe)
        exposure_id = DataCoordinate.standardize(instrument="DummyCam", exposure=100, universe=universe)
        return {visit_id: [exposure_id]}

    def testWriteCuratedCalibrations(self):
        """Run the inherited curated-calibration test with the test "data
        package" location injected into DummyCam.
        """
        DummyCam.dataPackageDir = os.path.join(TESTDIR, "data", "curated")
        return super().testWriteCuratedCalibrations()

    def _check_obscore(self, registry: Registry, has_visits: bool) -> None:
        # Docstring inherited from base class.
        assert registry.obsCoreTableManager is not None
        with registry.obsCoreTableManager.query(lsst_run=self.outputRun) as result:
            rows = list(result)
        self.assertEqual(len(rows), 1)
        (row,) = rows

        if has_visits:
            for value in (row.s_ra, row.s_dec, row.s_fov):
                self.assertIsNotNone(value)
            self.assertRegex(row.s_region, "POLYGON ICRS .*")
        else:
            # No spatial information until visits are defined.
            for value in (row.s_ra, row.s_dec, row.s_fov, row.s_region):
                self.assertIsNone(value)
class RawIngestImpliedIndexTestCase(RawIngestTestCase):
    """Test ingest where metadata comes from an implied JSON index file
    rather than per-file sidecars.
    """

    # Same suite as the parent class, but pointed at the indexed data tree.
    file = os.path.join(INGESTDIR, "indexed_data", "dataset_1.yaml")
class RawIngestEdgeCaseTestCase(unittest.TestCase):
    """Test ingest using non-standard approaches including failures.

    Must create a new butler for each test because dimension records are
    globals.
    """

    def setUp(self):
        butlerConfig = """
datastore:
  # Want to ingest real files so can't use in-memory datastore
  cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
"""
        self.root = tempfile.mkdtemp(dir=TESTDIR)
        self.creatorButler = butlerTests.makeTestRepo(self.root, {}, config=Config.fromYaml(butlerConfig))
        DummyCam().register(self.creatorButler.registry)

        # Each test method gets its own output collection.
        self.butler = butlerTests.makeTestCollection(self.creatorButler, uniqueId=self.id())
        self.outputRun = self.butler.run

        config = RawIngestTask.ConfigClass()
        self.task = RawIngestTask(config=config, butler=self.butler)

        # Different test files.
        self.bad_metadata_file = os.path.join(TESTDIR, "data", "small.fits")
        self.good_file = os.path.join(INGESTDIR, "sidecar_data", "dataset_2.yaml")
        self.bad_instrument_file = os.path.join(TESTDIR, "data", "calexp.fits")

    def tearDown(self):
        if self.root is not None:
            shutil.rmtree(self.root, ignore_errors=True)

    def testSimpleIngest(self):
        """Ingest a single file, then two files in parallel."""
        # Use the default per-instrument run for this.
        self.task.run([self.good_file])
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections="DummyCam/raw/all"))
        self.assertEqual(len(datasets), 1)

        # Now parallelized.
        files = [self.good_file, os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")]
        self.task.run(files, processes=2, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

    def testTimeStampWarning(self):
        """Ingest a dataset which should generate a warning because of
        the end time being before the begin time.
        """
        # NOTE(review): this test was previously disabled with a bare
        # ``return``, which makes it report success while testing nothing.
        # Use an explicit skip so the disablement is visible in test output.
        self.skipTest("Disabled pending investigation of the end-time warning behavior.")
        files = [os.path.join(INGESTDIR, "sidecar_data", "dataset_end.yaml")]
        with self.assertLogs("lsst.obs.base._instrument", level="WARNING") as cm:
            self.task.run(files, run=self.outputRun)

        self.assertIn("has end time before begin time", cm.output[0])
        records = list(
            self.butler.registry.queryDimensionRecords(
                "exposure",
                where="exposure = exp AND instrument = inst",
                bind={"exp": 3000, "inst": "DummyCam"},
            )
        )
        record = records[0]
        timespan = record.timespan
        self.assertEqual(timespan.begin.isot, timespan.end.isot)

    def testExplicitIndex(self):
        """Ingest via explicitly-named index files, including duplicate and
        conflicting index combinations.
        """
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        self.task.run(files, run=self.outputRun)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Try again with an explicit index and a file that is in that index.
        files.append(os.path.join(INGESTDIR, "indexed_data", "dataset_2.yaml"))
        new_run = self.outputRun + "b"
        self.task.run(files, run=new_run)

        # NOTE(review): this queries self.outputRun, not new_run — possibly a
        # copy/paste slip; confirm whether new_run was intended.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Now with two index files that point to the same files.
        # Look for the warning from duplication.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "translated_subdir", "_index.json"),
        ]
        new_run = self.outputRun + "c"

        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring content", cm.output[0])

        # NOTE(review): as above, this queries self.outputRun rather than
        # new_run — confirm intent.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
        ]
        new_run = self.outputRun + "d"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file but overriding", cm.output[0])

        # Reversing the order should change the warning: the translated
        # index is now seen first, so the metadata one is ignored.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
        ]

        new_run = self.outputRun + "e"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring", cm.output[0])

        # Bad index file.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # Bad index file due to bad instrument.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_instrument", "_index.json")]
        with self.assertLogs(level="WARNING") as cm:
            with self.assertRaises(RuntimeError):
                self.task.run(files, run=self.outputRun)
        self.assertIn("Instrument HSC for file", cm.output[0])

    def testBadExposure(self):
        """Test that bad exposures trigger the correct failure modes.

        This is the only test that uses the bad definition of dataset 4
        because exposure definitions are defined globally in a butler registry.
        """
        # Ingest 3 files. 2 of them will implicitly find an index and one
        # will use a sidecar.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3)]
        new_run = self.outputRun
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 3)

        # Test fail fast.
        self.task.config.failFast = True

        # Ingest files with conflicting exposure definitions.
        # Ingest 3 files. One of them will implicitly find an index and one
        # will use a sidecar. The 3rd will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 3, 4)]
        new_run = self.outputRun + "_bad_exposure"
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run(files, run=new_run)

    def testBadFile(self):
        """Try to ingest a bad file."""
        files = [self.bad_metadata_file]

        with self.assertRaises(RuntimeError) as cm:
            # Default is to raise an error at the end.
            self.task.run(files, run=self.outputRun)
        self.assertIn("Some failures", str(cm.exception))

        # Including a good file will result in ingest working but still
        # raises (we might want to move this to solely happen in the
        # command line invocation).
        files.append(self.good_file)

        # Also include a file with unknown instrument.
        files.append(self.bad_instrument_file)

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 1)

        # Fail fast will trigger a run time error with different text.
        # Use a different output run to be sure we are not failing because
        # of the attempt to ingest twice.
        self.task.config.failFast = True
        new_run = self.outputRun + "b"
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_metadata_file, self.good_file], run=new_run)
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Attempt to ingest good file again -- this will fail for a different
        # reason than failed metadata extraction.
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run([self.good_file], run=self.outputRun)

        # Ingest a file with good metadata but unknown instrument.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_instrument_file], run=self.outputRun)
        self.assertIn("Instrument HSC", str(cm.exception))

        # Ingest of a metadata index file that will fail translation.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")])
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Ingest of a bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")])
        self.assertIn("Problem reading index file", str(cm.exception))

        # Ingest of an implied bad index file.  (No message check here, so no
        # need to bind the exception context.)
        with self.assertRaises(RuntimeError):
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_implied", "dataset_2.yaml")])

    def testCallbacks(self):
        """Test the callbacks for failures."""
        # Define the callbacks.
        metadata_failures = []
        successes = []
        ingest_failures = []

        def on_metadata_failure(filename, exc):
            metadata_failures.append(filename)

        def on_success(datasets):
            successes.append(datasets)

        def on_ingest_failure(exposure, exc):
            ingest_failures.append(exposure)

        # Need our own task instance
        config = RawIngestTask.ConfigClass()
        self.task = RawIngestTask(
            config=config,
            butler=self.butler,
            on_metadata_failure=on_metadata_failure,
            on_success=on_success,
            on_ingest_failure=on_ingest_failure,
        )

        files = [self.good_file, self.bad_metadata_file, self.bad_instrument_file]

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(metadata_failures), 2)
        self.assertEqual(len(ingest_failures), 0)

        # Try the good one a second time.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(ingest_failures), 1)

        # An index file with metadata that won't translate.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 2)

        # Bad index file.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 1)

        # Ingest two files that have conflicting exposure metadata.
        ingest_failures[:] = []
        successes[:] = []
        # Ingest 4 files. 2 of them will implicitly find an index and one
        # will use a sidecar. The 4th will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3, 4)]
        new_run = self.outputRun + "_fail"
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=new_run)
        self.assertEqual(len(ingest_failures), 1)
        self.assertEqual(len(successes), 3)

    def testSkipExistingExposures(self):
        """Test that skip_existing_exposures=True avoids exceptions from trying
        to ingest the same file twice.

        Notes
        -----
        This option also prevents not-ingested-yet raws from being ingested
        when exposure already exists, but that's (A) hard to test given the
        test data we have now and (B) not really ideal behavior, just behavior
        we can live with in order to have a way to avoid keep duplicate ingests
        from being an error.
        """
        # Ingest the first time.
        self.task.run([self.good_file], run=self.outputRun)
        # Attempt to ingest a second time with skip_existing_exposures=False
        # (default). This should fail.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)
        # Try again with ``skip_existing_exposures=True``.
        self.task.run([self.good_file], run=self.outputRun, skip_existing_exposures=True)

    def testUpdateExposureRecords(self):
        """Test that update_exposure_records=True allows metadata to be
        modified.
        """
        config = RawIngestTask.ConfigClass(failFast=True)
        task = RawIngestTask(config=config, butler=self.butler)
        with open(os.path.join(INGESTDIR, "sidecar_data", "dataset_1.json")) as file:
            metadata = json.load(file)
        # Modify unique identifiers to avoid clashes with ingests from
        # other test methods in this test case, because those share a
        # data repository.
        metadata["observation_id"] = "DummyDataset_testUpdateExposureRecords"
        metadata["observation_counter"] = 10
        metadata["exposure_id"] = 500
        metadata["exposure_group"] = "50"
        metadata["visit_id"] = 500
        base_filename = "dataset"
        # Create the temporary directory before entering the try block so the
        # finally clause can always reference tmp_dir (previously a mkdtemp
        # failure would have raised NameError in the cleanup).
        tmp_dir = tempfile.mkdtemp(dir=TESTDIR)
        try:
            # Copy the original file to be ingested (.yaml) to a temporary
            # directory, and write the new metadata next to it.
            raw_filename = os.path.join(tmp_dir, f"{base_filename}.yaml")
            sidecar_filename = os.path.join(tmp_dir, f"{base_filename}.json")
            shutil.copy(self.good_file, raw_filename)
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun)
            (record1,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record1.exposure_time, metadata["exposure_time"])
            # Modify some metadata and repeat the process to update the
            # exposure.
            metadata["exposure_time"] *= 2.0
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run(
                [raw_filename], run=self.outputRun, skip_existing_exposures=True, update_exposure_records=True
            )
            (record2,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record2.exposure_time, record1.exposure_time * 2)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)
class TestRawIngestTaskPickle(unittest.TestCase):
    """Test that pickling of the RawIngestTask works properly."""

    @classmethod
    def setUpClass(cls):
        # A single shared repo is enough; each test gets its own collection.
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {})

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler, uniqueId=self.id())

        self.config = RawIngestTask.ConfigClass()
        self.config.transfer = "copy"  # safe non-default value
        self.task = RawIngestTask(config=self.config, butler=self.butler)

    def testPickleTask(self):
        """Round-trip the task through pickle and compare key attributes."""
        restored = pickle.loads(pickle.dumps(self.task))
        self.assertEqual(self.task.getFullName(), restored.getFullName())
        self.assertEqual(self.task.log.name, restored.log.name)
        self.assertEqual(self.task.config, restored.config)
        self.assertEqual(self.task.butler._config, restored.butler._config)
        self.assertEqual(self.task.butler.collections, restored.butler.collections)
        self.assertEqual(self.task.butler.run, restored.butler.run)
        self.assertEqual(self.task.universe, restored.universe)
        self.assertEqual(self.task.datasetType, restored.datasetType)
# Coverage-report branch annotations were fused into this line in the dump;
# restore the standard, syntactically valid script entry point.
if __name__ == "__main__":
    unittest.main()