Coverage for tests/test_ingest.py: 19%
250 statements
coverage.py v7.2.7, created at 2023-07-18 09:42 +0000

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import json
import os
import pickle
import shutil
import tempfile
import unittest

import lsst.daf.butler.tests as butlerTests
from lsst.daf.butler import Butler, Config, DataCoordinate, Registry
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.obs.base import RawIngestTask
from lsst.obs.base.ingest_tests import IngestTestBase
from lsst.obs.base.instrument_tests import DummyCam
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))
INGESTDIR = os.path.join(TESTDIR, "data", "ingest")


class RawIngestTestCase(IngestTestBase, unittest.TestCase):
    """Test ingest using JSON sidecar files."""

    ingestDatasetTypeName = "raw_dict"
    rawIngestTask = get_full_type_name(RawIngestTask)
    curatedCalibrationDatasetTypes = ("testCalib",)
    ingestDir = TESTDIR
    instrumentClassName = "lsst.obs.base.instrument_tests.DummyCam"
    file = os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")
    dataIds = [{"instrument": "DummyCam", "exposure": 100, "detector": 0}]
    seed_config = os.path.join(TESTDIR, "data", "curated", "seed.yaml")

    @property
    def visits(self):
        butler = Butler(self.root, collections=[self.outputRun])
        return {
            DataCoordinate.standardize(instrument="DummyCam", visit=100, universe=butler.dimensions): [
                DataCoordinate.standardize(instrument="DummyCam", exposure=100, universe=butler.dimensions)
            ]
        }

    def testWriteCuratedCalibrations(self):
        # Inject the "data package" location.
        DummyCam.dataPackageDir = os.path.join(TESTDIR, "data", "curated")
        return super().testWriteCuratedCalibrations()

    def _check_obscore(self, registry: Registry, has_visits: bool) -> None:
        # Docstring inherited from base class.
        assert registry.obsCoreTableManager is not None
        with registry.obsCoreTableManager.query(lsst_run=self.outputRun) as result:
            rows = list(result)
            self.assertEqual(len(rows), 1)
            row = rows[0]

            # No spatial information until visits are defined.
            if not has_visits:
                self.assertIsNone(row.s_ra)
                self.assertIsNone(row.s_dec)
                self.assertIsNone(row.s_fov)
                self.assertIsNone(row.s_region)
            else:
                self.assertIsNotNone(row.s_ra)
                self.assertIsNotNone(row.s_dec)
                self.assertIsNotNone(row.s_fov)
                self.assertRegex(row.s_region, "POLYGON ICRS .*")


class RawIngestImpliedIndexTestCase(RawIngestTestCase):
    """Test ingest using JSON index files."""

    file = os.path.join(INGESTDIR, "indexed_data", "dataset_1.yaml")


class RawIngestEdgeCaseTestCase(unittest.TestCase):
    """Test ingest using non-standard approaches including failures."""

    @classmethod
    def setUpClass(cls):
        butlerConfig = """
datastore:
  # Want to ingest real files so can't use in-memory datastore
  cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
"""
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {}, config=Config.fromYaml(butlerConfig))
        DummyCam().register(cls.creatorButler.registry)

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)
        self.outputRun = self.butler.run

        config = RawIngestTask.ConfigClass()
        self.task = RawIngestTask(config=config, butler=self.butler)

        # Different test files: one with untranslatable metadata, one good
        # sidecar-backed dataset, and one whose metadata reports an instrument
        # that is not registered in this test repository.
        self.bad_metadata_file = os.path.join(TESTDIR, "data", "small.fits")
        self.good_file = os.path.join(INGESTDIR, "sidecar_data", "dataset_2.yaml")
        self.bad_instrument_file = os.path.join(TESTDIR, "data", "calexp.fits")

    def testSimpleIngest(self):
        # Use the default per-instrument run for this.
        self.task.run([self.good_file])
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections="DummyCam/raw/all"))
        self.assertEqual(len(datasets), 1)

        # Now parallelized.
        files = [self.good_file, os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")]
        self.task.run(files, processes=2, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

    def testExplicitIndex(self):
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        self.task.run(files, run=self.outputRun)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Try again with an explicit index and a file that is in that index.
        files.append(os.path.join(INGESTDIR, "indexed_data", "dataset_2.yaml"))
        new_run = self.outputRun + "b"
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Now with two index files that point to the same files.
        # Look for the warning from duplication.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "translated_subdir", "_index.json"),
        ]
        new_run = self.outputRun + "c"

        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring content", cm.output[0])

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Again with one index file of raw metadata and one of translated
        # metadata. Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
        ]
        new_run = self.outputRun + "d"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file but overriding", cm.output[0])

        # Reversing the order should change the warning: the translated index
        # now comes first, so the later metadata index is ignored.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
        ]

        new_run = self.outputRun + "e"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring", cm.output[0])

        # Bad index file.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # Bad index file due to bad instrument.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_instrument", "_index.json")]
        with self.assertLogs(level="WARNING") as cm:
            with self.assertRaises(RuntimeError):
                self.task.run(files, run=self.outputRun)
        self.assertIn("Instrument HSC for file", cm.output[0])

    def testBadExposure(self):
        """Test that bad exposures trigger the correct failure modes.

        This is the only test that uses the bad definition of dataset 4,
        because exposure definitions are global in a butler registry.
        """
        # Ingest 3 files. 2 of them will implicitly find an index and one
        # will use a sidecar.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3)]
        new_run = self.outputRun
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 3)

        # Test fail fast.
        self.task.config.failFast = True

        # Ingest files with conflicting exposure definitions.
        # Ingest 3 files. One of them will implicitly find an index and one
        # will use a sidecar. The third will fail due to the exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 3, 4)]
        new_run = self.outputRun + "_bad_exposure"
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run(files, run=new_run)

    def testBadFile(self):
        """Try to ingest a bad file."""
        files = [self.bad_metadata_file]

        with self.assertRaises(RuntimeError) as cm:
            # Default is to raise an error at the end.
            self.task.run(files, run=self.outputRun)
        self.assertIn("Some failures", str(cm.exception))

        # Including a good file results in the ingest working but an
        # exception is still raised (we might want to move this to solely
        # happen in the command-line invocation).
        files.append(self.good_file)

        # Also include a file with an unknown instrument.
        files.append(self.bad_instrument_file)

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 1)

        # Fail fast will trigger a RuntimeError with different text.
        # Use a different output run to be sure we are not failing because
        # of the attempt to ingest twice.
        self.task.config.failFast = True
        new_run = self.outputRun + "b"
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_metadata_file, self.good_file], run=new_run)
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Attempt to ingest the good file again -- this will fail for a
        # different reason than failed metadata extraction.
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run([self.good_file], run=self.outputRun)

        # Ingest a file with good metadata but an unknown instrument.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_instrument_file], run=self.outputRun)
        self.assertIn("Instrument HSC", str(cm.exception))

        # Ingest of a metadata index file that will fail translation.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")])
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Ingest of a bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")])
        self.assertIn("Problem reading index file", str(cm.exception))

        # Ingest of an implied bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_implied", "dataset_2.yaml")])

    def testCallbacks(self):
        """Test the callbacks for failures."""
        # Define the callbacks.
        metadata_failures = []
        successes = []
        ingest_failures = []

        def on_metadata_failure(filename, exc):
            metadata_failures.append(filename)

        def on_success(datasets):
            successes.append(datasets)

        def on_ingest_failure(exposure, exc):
            ingest_failures.append(exposure)

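        # The lists above record what each callback received, so the
        # assertions below can count successes and failures.
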
        # Need our own task instance
        config = RawIngestTask.ConfigClass()
        self.task = RawIngestTask(
            config=config,
            butler=self.butler,
            on_metadata_failure=on_metadata_failure,
            on_success=on_success,
            on_ingest_failure=on_ingest_failure,
        )

        files = [self.good_file, self.bad_metadata_file, self.bad_instrument_file]

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(metadata_failures), 2)
        self.assertEqual(len(ingest_failures), 0)

        # Try the good one a second time.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(ingest_failures), 1)

        # An index file with metadata that won't translate.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 2)

        # Bad index file.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 1)

        # Ingest two files that have conflicting exposure metadata.
        ingest_failures[:] = []
        successes[:] = []
        # Ingest 4 files. 2 of them will implicitly find an index and one
        # will use a sidecar. The 4th will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3, 4)]
        new_run = self.outputRun + "_fail"
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=new_run)
        self.assertEqual(len(ingest_failures), 1)
        self.assertEqual(len(successes), 3)

    def testSkipExistingExposures(self):
        """Test that skip_existing_exposures=True avoids exceptions from
        trying to ingest the same file twice.

        Notes
        -----
        This option also prevents not-yet-ingested raws from being ingested
        when the exposure already exists, but that's (A) hard to test given
        the test data we have now and (B) not really ideal behavior, just
        behavior we can live with in order to have a way to keep duplicate
        ingests from being an error.
        """
        # Ingest the first time.
        self.task.run([self.good_file], run=self.outputRun)
        # Attempt to ingest a second time with skip_existing_exposures=False
        # (default). This should fail.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)
        # Try again with skip_existing_exposures=True.
        self.task.run([self.good_file], run=self.outputRun, skip_existing_exposures=True)

    def testUpdateExposureRecords(self):
        """Test that update_exposure_records=True allows metadata to be
        modified.
        """
        config = RawIngestTask.ConfigClass(failFast=True)
        task = RawIngestTask(config=config, butler=self.butler)
        with open(os.path.join(INGESTDIR, "sidecar_data", "dataset_1.json")) as file:
            metadata = json.load(file)
        # Modify unique identifiers to avoid clashes with ingests from
        # other test methods in this test case, because those share a
        # data repository.
        metadata["observation_id"] = "DummyDataset_testUpdateExposureRecords"
        metadata["observation_counter"] = 10
        metadata["exposure_id"] = 500
        metadata["exposure_group"] = "50"
        metadata["visit_id"] = 500
        base_filename = "dataset"
        try:
            # Copy the original file to be ingested (.yaml) to a temporary
            # directory, and write the new metadata next to it.
            tmp_dir = tempfile.mkdtemp(dir=TESTDIR)
            raw_filename = os.path.join(tmp_dir, f"{base_filename}.yaml")
            sidecar_filename = os.path.join(tmp_dir, f"{base_filename}.json")
            shutil.copy(self.good_file, raw_filename)
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun)
            (record1,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record1.exposure_time, metadata["exposure_time"])
            # Modify some metadata and repeat the process to update the
            # exposure.
            metadata["exposure_time"] *= 2.0
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run(
                [raw_filename], run=self.outputRun, skip_existing_exposures=True, update_exposure_records=True
            )
            (record2,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record2.exposure_time, record1.exposure_time * 2)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)


class TestRawIngestTaskPickle(unittest.TestCase):
    """Test that pickling of the RawIngestTask works properly."""

    @classmethod
    def setUpClass(cls):
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {})

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)

        self.config = RawIngestTask.ConfigClass()
        self.config.transfer = "copy"  # safe non-default value
        self.task = RawIngestTask(config=self.config, butler=self.butler)

    def testPickleTask(self):
        stream = pickle.dumps(self.task)
        copy = pickle.loads(stream)
        self.assertEqual(self.task.getFullName(), copy.getFullName())
        self.assertEqual(self.task.log.name, copy.log.name)
        self.assertEqual(self.task.config, copy.config)
        self.assertEqual(self.task.butler._config, copy.butler._config)
        self.assertEqual(self.task.butler.collections, copy.butler.collections)
        self.assertEqual(self.task.butler.run, copy.butler.run)
        self.assertEqual(self.task.universe, copy.universe)
        self.assertEqual(self.task.datasetType, copy.datasetType)


if __name__ == "__main__":
    unittest.main()