Coverage for tests/test_ingest.py: 17% of 250 statements (coverage.py v7.2.5, created at 2023-05-06 02:49 -0700)
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import json
import os
import pickle
import shutil
import tempfile
import unittest

import lsst.daf.butler.tests as butlerTests
from lsst.daf.butler import Butler, Config, DataCoordinate, Registry
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.obs.base import RawIngestTask
from lsst.obs.base.ingest_tests import IngestTestBase
from lsst.obs.base.instrument_tests import DummyCam
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))
INGESTDIR = os.path.join(TESTDIR, "data", "ingest")


class RawIngestTestCase(IngestTestBase, unittest.TestCase):
    """Test ingest using JSON sidecar files."""

    ingestDatasetTypeName = "raw_dict"
    rawIngestTask = get_full_type_name(RawIngestTask)
    curatedCalibrationDatasetTypes = ("testCalib",)
    ingestDir = TESTDIR
    instrumentClassName = "lsst.obs.base.instrument_tests.DummyCam"
    file = os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")
    dataIds = [dict(instrument="DummyCam", exposure=100, detector=0)]
    seed_config = os.path.join(TESTDIR, "data", "curated", "seed.yaml")

    @property
    def visits(self):
        butler = Butler(self.root, collections=[self.outputRun])
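        # The test data defines a single visit (100) made up of a single
        # exposure (100), so the expected mapping has exactly one entry.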
        return {
            DataCoordinate.standardize(
                instrument="DummyCam", visit=100, universe=butler.registry.dimensions
            ): [
                DataCoordinate.standardize(
                    instrument="DummyCam", exposure=100, universe=butler.registry.dimensions
                )
            ]
        }

    def testWriteCuratedCalibrations(self):
        # Inject the "data package" location.
        DummyCam.dataPackageDir = os.path.join(TESTDIR, "data", "curated")
        return super().testWriteCuratedCalibrations()

    def _check_obscore(self, registry: Registry, has_visits: bool) -> None:
        # Docstring inherited from base class.
        assert registry.obsCoreTableManager is not None
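        # Query the ObsCore records written for this output run; the spatial
        # columns use the IVOA ObsCore names (s_ra, s_dec, s_fov, s_region).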
        with registry.obsCoreTableManager.query(lsst_run=self.outputRun) as result:
            rows = list(result)
            self.assertEqual(len(rows), 1)
            row = rows[0]

            # No spatial information until visits are defined
            if not has_visits:
                self.assertIsNone(row.s_ra)
                self.assertIsNone(row.s_dec)
                self.assertIsNone(row.s_fov)
                self.assertIsNone(row.s_region)
            else:
                self.assertIsNotNone(row.s_ra)
                self.assertIsNotNone(row.s_dec)
                self.assertIsNotNone(row.s_fov)
                self.assertRegex(row.s_region, "POLYGON ICRS .*")


class RawIngestImpliedIndexTestCase(RawIngestTestCase):
    """Test ingest using JSON index files."""

    file = os.path.join(INGESTDIR, "indexed_data", "dataset_1.yaml")


class RawIngestEdgeCaseTestCase(unittest.TestCase):
    """Test ingest using non-standard approaches including failures."""

    @classmethod
    def setUpClass(cls):
        butlerConfig = """
datastore:
  # Want to ingest real files so can't use in-memory datastore
  cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
"""
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {}, config=Config.fromYaml(butlerConfig))
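        # Register the test instrument once; its dimension records are shared
        # by every test method using this repository.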
        DummyCam().register(cls.creatorButler.registry)

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)
        self.outputRun = self.butler.run

        config = RawIngestTask.ConfigClass()
        self.task = RawIngestTask(config=config, butler=self.butler)

        # Different test files.
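        # small.fits has headers that cannot be translated to exposure
        # metadata, dataset_2.yaml is a good dataset with a JSON sidecar, and
        # calexp.fits reports an instrument (HSC) unknown to this repository.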
        self.bad_metadata_file = os.path.join(TESTDIR, "data", "small.fits")
        self.good_file = os.path.join(INGESTDIR, "sidecar_data", "dataset_2.yaml")
        self.bad_instrument_file = os.path.join(TESTDIR, "data", "calexp.fits")

    def testSimpleIngest(self):
        # Use the default per-instrument run for this.
        self.task.run([self.good_file])
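        # With no explicit run, the raw lands in the instrument's default raw
        # collection, DummyCam/raw/all.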
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections="DummyCam/raw/all"))
        self.assertEqual(len(datasets), 1)

        # Now parallelized.
        files = [self.good_file, os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")]
        self.task.run(files, processes=2, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

    def testExplicitIndex(self):
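        # An _index.json file bundles the metadata for several raw files in
        # one place, so ingest can be driven from the index instead of
        # per-file sidecars.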
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        self.task.run(files, run=self.outputRun)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Try again with an explicit index and a file that is in that index.
        files.append(os.path.join(INGESTDIR, "indexed_data", "dataset_2.yaml"))
        new_run = self.outputRun + "b"
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Now with two index files that point to the same files.
        # Look for the warning from duplication.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "translated_subdir", "_index.json"),
        ]
        new_run = self.outputRun + "c"

        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring content", cm.output[0])

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
        ]
        new_run = self.outputRun + "d"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file but overriding", cm.output[0])

        # Reversing the order should change the warning: with the translated
        # index listed first, the metadata index is the one that gets ignored.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
        ]

        new_run = self.outputRun + "e"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring", cm.output[0])

        # Bad index file.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # Bad index file due to bad instrument.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_instrument", "_index.json")]
        with self.assertLogs(level="WARNING") as cm:
            with self.assertRaises(RuntimeError):
                self.task.run(files, run=self.outputRun)
        self.assertIn("Instrument HSC for file", cm.output[0])

    def testBadExposure(self):
        """Test that bad exposures trigger the correct failure modes.

        This is the only test that uses the bad definition of dataset 4
        because exposure definitions are defined globally in a butler
        registry.
        """

        # Ingest 3 files. 2 of them will implicitly find an index and one
        # will use a sidecar.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3)]
        new_run = self.outputRun
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 3)

        # Test fail fast.
        self.task.config.failFast = True

        # Ingest files with conflicting exposure definitions.
        # Ingest 3 files. One of them will implicitly find an index and one
        # will use a sidecar. The 3rd will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 3, 4)]
        new_run = self.outputRun + "_bad_exposure"
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run(files, run=new_run)

    def testBadFile(self):
        """Try to ingest a bad file."""
        files = [self.bad_metadata_file]

        with self.assertRaises(RuntimeError) as cm:
            # Default is to raise an error at the end.
            self.task.run(files, run=self.outputRun)
        self.assertIn("Some failures", str(cm.exception))

        # Including a good file will result in the good file being ingested,
        # but an exception is still raised at the end (we might want to move
        # this so it only happens in the command-line invocation).
        files.append(self.good_file)

        # Also include a file with unknown instrument.
        files.append(self.bad_instrument_file)

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 1)

        # Fail fast will trigger a run time error with different text.
        # Use a different output run to be sure we are not failing because
        # of the attempt to ingest twice.
        self.task.config.failFast = True
        new_run = self.outputRun + "b"
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_metadata_file, self.good_file], run=new_run)
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Attempt to ingest good file again -- this will fail for a different
        # reason than failed metadata extraction.
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run([self.good_file], run=self.outputRun)

        # Ingest a file with good metadata but unknown instrument.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_instrument_file], run=self.outputRun)
        self.assertIn("Instrument HSC", str(cm.exception))

        # Ingest of a metadata index file that will fail translation.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")])
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Ingest of a bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")])
        self.assertIn("Problem reading index file", str(cm.exception))

        # Ingest of an implied bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_implied", "dataset_2.yaml")])

    def testCallbacks(self):
        """Test the callbacks for failures."""

        # Define the callbacks.
        metadata_failures = []
        successes = []
        ingest_failures = []

        def on_metadata_failure(filename, exc):
            metadata_failures.append(filename)

        def on_success(datasets):
            successes.append(datasets)

        def on_ingest_failure(exposure, exc):
            ingest_failures.append(exposure)

        # Need our own task instance
        config = RawIngestTask.ConfigClass()
        self.task = RawIngestTask(
            config=config,
            butler=self.butler,
            on_metadata_failure=on_metadata_failure,
            on_success=on_success,
            on_ingest_failure=on_ingest_failure,
        )

        files = [self.good_file, self.bad_metadata_file, self.bad_instrument_file]

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
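        # Both the unreadable-metadata file and the unknown-instrument file
        # are reported through on_metadata_failure; only the good file is
        # ingested successfully.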
        self.assertEqual(len(successes), 1)
        self.assertEqual(len(metadata_failures), 2)
        self.assertEqual(len(ingest_failures), 0)

        # Try the good one a second time.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(ingest_failures), 1)

        # An index file with metadata that won't translate.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 2)

        # Bad index file.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 1)

        # Ingest two files that have conflicting exposure metadata.
        ingest_failures[:] = []
        successes[:] = []
        # Ingest 4 files. 2 of them will implicitly find an index and one
        # will use a sidecar. The 4th will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3, 4)]
        new_run = self.outputRun + "_fail"
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=new_run)
        self.assertEqual(len(ingest_failures), 1)
        self.assertEqual(len(successes), 3)

    def testSkipExistingExposures(self):
        """Test that skip_existing_exposures=True avoids exceptions from
        trying to ingest the same file twice.

        Notes
        -----
        This option also prevents not-yet-ingested raws from being ingested
        when the exposure already exists, but that is (A) hard to test given
        the test data we have now and (B) not really ideal behavior, just
        behavior we can live with in order to have a way to keep duplicate
        ingests from being an error.
        """
        # Ingest the first time.
        self.task.run([self.good_file], run=self.outputRun)
        # Attempt to ingest a second time with skip_existing_exposures=False
        # (default). This should fail.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)
        # Try again with skip_existing_exposures=True.
        self.task.run([self.good_file], run=self.outputRun, skip_existing_exposures=True)

    def testUpdateExposureRecords(self):
        """Test that update_exposure_records=True allows metadata to be
        modified.
        """
        config = RawIngestTask.ConfigClass(failFast=True)
        task = RawIngestTask(config=config, butler=self.butler)
        with open(os.path.join(INGESTDIR, "sidecar_data", "dataset_1.json"), "r") as file:
            metadata = json.load(file)
        # Modify unique identifiers to avoid clashes with ingests from
        # other test methods in this test case, because those share a
        # data repository.
        metadata["observation_id"] = "DummyDataset_testUpdateExposureRecords"
        metadata["observation_counter"] = 10
        metadata["exposure_id"] = 500
        metadata["exposure_group"] = "50"
        metadata["visit_id"] = 500
        base_filename = "dataset"
        try:
            # Copy the original file to be ingested (.yaml) to a temporary
            # directory, and write the new metadata next to it.
            tmp_dir = tempfile.mkdtemp(dir=TESTDIR)
            raw_filename = os.path.join(tmp_dir, f"{base_filename}.yaml")
            sidecar_filename = os.path.join(tmp_dir, f"{base_filename}.json")
            shutil.copy(self.good_file, raw_filename)
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun)
            (record1,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record1.exposure_time, metadata["exposure_time"])
            # Modify some metadata and repeat the process to update the
            # exposure.
            metadata["exposure_time"] *= 2.0
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
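            # skip_existing_exposures=True avoids a conflict on re-ingest, and
            # update_exposure_records=True rewrites the exposure record from
            # the new sidecar metadata.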
            task.run(
                [raw_filename], run=self.outputRun, skip_existing_exposures=True, update_exposure_records=True
            )
            (record2,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record2.exposure_time, record1.exposure_time * 2)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)


class TestRawIngestTaskPickle(unittest.TestCase):
    """Test that pickling of the RawIngestTask works properly."""

    @classmethod
    def setUpClass(cls):
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {})

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)

        self.config = RawIngestTask.ConfigClass()
        self.config.transfer = "copy"  # safe non-default value
        self.task = RawIngestTask(config=self.config, butler=self.butler)

    def testPickleTask(self):
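        # Round-trip the task through pickle and check that its configuration,
        # logger, butler state, and dataset type all survive.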
        stream = pickle.dumps(self.task)
        copy = pickle.loads(stream)
        self.assertEqual(self.task.getFullName(), copy.getFullName())
        self.assertEqual(self.task.log.name, copy.log.name)
        self.assertEqual(self.task.config, copy.config)
        self.assertEqual(self.task.butler._config, copy.butler._config)
        self.assertEqual(self.task.butler.collections, copy.butler.collections)
        self.assertEqual(self.task.butler.run, copy.butler.run)
        self.assertEqual(self.task.universe, copy.universe)
        self.assertEqual(self.task.datasetType, copy.datasetType)


if __name__ == "__main__":
    unittest.main()