Coverage for tests/test_ingest.py: 18%
236 statements
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-02 18:43 -0700
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22import json
23import os
24import pickle
25import shutil
26import tempfile
27import unittest
29import lsst.daf.butler.tests as butlerTests
30from lsst.daf.butler import Butler, Config, DataCoordinate, DatasetType
31from lsst.daf.butler.registry import ConflictingDefinitionError
32from lsst.obs.base import RawIngestTask
33from lsst.obs.base.ingest_tests import IngestTestBase
34from lsst.obs.base.instrument_tests import DummyCam
35from lsst.utils.introspection import get_full_type_name
# Absolute path to the directory holding this test module, plus the
# location of the ingest test-data tree beneath it.
TESTDIR = os.path.abspath(os.path.dirname(__file__))
INGESTDIR = os.path.join(TESTDIR, "data", "ingest")
class DummyCamRawIngestTask(RawIngestTask):
    """Ingest task variant for DummyCam using a dataset type that can
    hold a non-Exposure payload.
    """

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task."""
        # Dict-valued raws keyed by the standard raw dimensions.
        dimensions = ("instrument", "detector", "exposure")
        return DatasetType(
            "raw_dict",
            dimensions,
            "StructuredDataDict",
            universe=self.butler.registry.dimensions,
        )
class RawIngestTestCase(IngestTestBase, unittest.TestCase):
    """Test ingest using JSON sidecar files."""

    # Configuration consumed by the IngestTestBase mixin.
    ingestDatasetTypeName = "raw_dict"
    rawIngestTask = get_full_type_name(DummyCamRawIngestTask)
    curatedCalibrationDatasetTypes = ()
    ingestDir = TESTDIR
    instrumentClassName = "lsst.obs.base.instrument_tests.DummyCam"
    file = os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")
    dataIds = [dict(instrument="DummyCam", exposure=100, detector=0)]

    @property
    def visits(self):
        """Mapping of each visit data ID to its exposure data IDs."""
        butler = Butler(self.root, collections=[self.outputRun])
        universe = butler.registry.dimensions
        visit_id = DataCoordinate.standardize(
            instrument="DummyCam", visit=100, universe=universe
        )
        exposure_id = DataCoordinate.standardize(
            instrument="DummyCam", exposure=100, universe=universe
        )
        return {visit_id: [exposure_id]}

    def testWriteCuratedCalibrations(self):
        """There are no curated calibrations in this test instrument"""
        pass
class RawIngestImpliedIndexTestCase(RawIngestTestCase):
    """Test ingest using JSON index files."""

    # Same suite as the sidecar case; metadata is instead found via the
    # implied _index.json living next to this data file.
    file = os.path.join(INGESTDIR, "indexed_data", "dataset_1.yaml")
class RawIngestEdgeCaseTestCase(unittest.TestCase):
    """Test ingest using non-standard approaches including failures."""

    @classmethod
    def setUpClass(cls):
        butlerConfig = """
datastore:
  # Want to ingest real files so can't use in-memory datastore
  cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
"""
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {}, config=Config.fromYaml(butlerConfig))
        DummyCam().register(cls.creatorButler.registry)

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        # Fresh collection per test so runs do not collide between methods.
        self.butler = butlerTests.makeTestCollection(self.creatorButler)
        self.outputRun = self.butler.run

        config = RawIngestTask.ConfigClass()
        self.task = DummyCamRawIngestTask(config=config, butler=self.butler)

        # Different test files.
        self.bad_metadata_file = os.path.join(TESTDIR, "data", "small.fits")
        self.good_file = os.path.join(INGESTDIR, "sidecar_data", "dataset_2.yaml")
        self.bad_instrument_file = os.path.join(TESTDIR, "data", "calexp.fits")

    def testSimpleIngest(self):
        """Ingest a single good file, then two files in parallel."""
        # Use the default per-instrument run for this.
        self.task.run([self.good_file])
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections="DummyCam/raw/all"))
        self.assertEqual(len(datasets), 1)

        # Now parallelized.
        files = [self.good_file, os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")]
        self.task.run(files, processes=2, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

    def testExplicitIndex(self):
        """Exercise explicit index files, duplicates, and bad indexes."""
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        self.task.run(files, run=self.outputRun)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Try again with an explicit index and a file that is in that index.
        files.append(os.path.join(INGESTDIR, "indexed_data", "dataset_2.yaml"))
        new_run = self.outputRun + "b"
        self.task.run(files, run=new_run)

        # NOTE(review): this queries self.outputRun rather than new_run —
        # presumably checking the original run is untouched; confirm intent.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Now with two index files that point to the same files.
        # Look for the warning from duplication.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "translated_subdir", "_index.json"),
        ]
        new_run = self.outputRun + "c"

        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring content", cm.output[0])

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
        ]
        new_run = self.outputRun + "d"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file but overriding", cm.output[0])

        # Reversing the order should change the warning.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
        ]
        new_run = self.outputRun + "e"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring", cm.output[0])

        # Bad index file.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # Bad index file due to bad instrument.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_instrument", "_index.json")]
        with self.assertLogs(level="WARNING") as cm:
            with self.assertRaises(RuntimeError):
                self.task.run(files, run=self.outputRun)
        self.assertIn("Instrument HSC for file", cm.output[0])

    def testBadExposure(self):
        """Test that bad exposures trigger the correct failure modes.

        This is the only test that uses the bad definition of dataset 4
        because exposure definitions are defined globally in a butler registry.
        """
        # Ingest 3 files. 2 of them will implicitly find an index and one
        # will use a sidecar.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3)]
        new_run = self.outputRun
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 3)

        # Test fail fast.
        self.task.config.failFast = True

        # Ingest files with conflicting exposure definitions.
        # Ingest 3 files. One of them will implicitly find an index and one
        # will use a sidecar. The 3rd will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 3, 4)]
        new_run = self.outputRun + "_bad_exposure"
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run(files, run=new_run)

    def testBadFile(self):
        """Try to ingest a bad file."""
        files = [self.bad_metadata_file]

        with self.assertRaises(RuntimeError) as cm:
            # Default is to raise an error at the end.
            self.task.run(files, run=self.outputRun)
        self.assertIn("Some failures", str(cm.exception))

        # Including a good file will result in ingest working but still
        # raises (we might want to move this to solely happen in the
        # command line invocation).
        files.append(self.good_file)

        # Also include a file with unknown instrument.
        files.append(self.bad_instrument_file)

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 1)

        # Fail fast will trigger a run time error with different text.
        # Use a different output run to be sure we are not failing because
        # of the attempt to ingest twice.
        self.task.config.failFast = True
        new_run = self.outputRun + "b"
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_metadata_file, self.good_file], run=new_run)
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Attempt to ingest good file again -- this will fail for a different
        # reason than failed metadata extraction.
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run([self.good_file], run=self.outputRun)

        # Ingest a file with good metadata but unknown instrument.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_instrument_file], run=self.outputRun)
        self.assertIn("Instrument HSC", str(cm.exception))

        # Ingest of a metadata index file that will fail translation.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")])
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Ingest of a bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")])
        self.assertIn("Problem reading index file", str(cm.exception))

        # Ingest of an implied bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_implied", "dataset_2.yaml")])

    def testCallbacks(self):
        """Test the callbacks for failures."""
        # Define the callbacks.
        metadata_failures = []
        successes = []
        ingest_failures = []

        def on_metadata_failure(filename, exc):
            metadata_failures.append(filename)

        def on_success(datasets):
            successes.append(datasets)

        def on_ingest_failure(exposure, exc):
            ingest_failures.append(exposure)

        # Need our own task instance
        config = RawIngestTask.ConfigClass()
        self.task = DummyCamRawIngestTask(
            config=config,
            butler=self.butler,
            on_metadata_failure=on_metadata_failure,
            on_success=on_success,
            on_ingest_failure=on_ingest_failure,
        )

        files = [self.good_file, self.bad_metadata_file, self.bad_instrument_file]

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(metadata_failures), 2)
        self.assertEqual(len(ingest_failures), 0)

        # Try the good one a second time.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(ingest_failures), 1)

        # An index file with metadata that won't translate.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 2)

        # Bad index file.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 1)

        # Ingest two files that have conflicting exposure metadata.
        ingest_failures[:] = []
        successes[:] = []
        # Ingest 4 files. 2 of them will implicitly find an index and one
        # will use a sidecar. The 4th will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3, 4)]
        new_run = self.outputRun + "_fail"
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=new_run)
        self.assertEqual(len(ingest_failures), 1)
        self.assertEqual(len(successes), 3)

    def testSkipExistingExposures(self):
        """Test that skip_existing_exposures=True avoids exceptions from trying
        to ingest the same file twice.

        Notes
        -----
        This option also prevents not-ingested-yet raws from being ingested
        when exposure already exists, but that's (A) hard to test given the
        test data we have now and (B) not really ideal behavior, just behavior
        we can live with in order to have a way to avoid keep duplicate ingests
        from being an error.
        """
        # Ingest the first time.
        self.task.run([self.good_file], run=self.outputRun)
        # Attempt to ingest a second time with skip_existing_exposures=False
        # (default). This should fail.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)
        # Try again with skip_existing_exposures=True.
        self.task.run([self.good_file], run=self.outputRun, skip_existing_exposures=True)

    def testUpdateExposureRecords(self):
        """Test that update_exposure_records=True allows metadata to be
        modified.
        """
        config = RawIngestTask.ConfigClass(failFast=True)
        task = DummyCamRawIngestTask(config=config, butler=self.butler)
        with open(os.path.join(INGESTDIR, "sidecar_data", "dataset_1.json"), "r") as file:
            metadata = json.load(file)
        # Modify unique identifiers to avoid clashes with ingests from
        # other test methods in this test case, because those share a
        # data repository.
        metadata["observation_id"] = "DummyDataset_testUpdateExposureRecords"
        metadata["observation_counter"] = 10
        metadata["exposure_id"] = 500
        metadata["exposure_group"] = "50"
        metadata["visit_id"] = 500
        base_filename = "dataset"
        # Create the temporary directory *before* entering the try block:
        # if mkdtemp itself raised inside the try, the finally clause would
        # hit a NameError on tmp_dir.
        tmp_dir = tempfile.mkdtemp(dir=TESTDIR)
        try:
            # Copy the original file to be ingested (.yaml) to the temporary
            # directory, and write the new metadata next to it.
            raw_filename = os.path.join(tmp_dir, f"{base_filename}.yaml")
            sidecar_filename = os.path.join(tmp_dir, f"{base_filename}.json")
            shutil.copy(self.good_file, raw_filename)
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun)
            (record1,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record1.exposure_time, metadata["exposure_time"])
            # Modify some metadata and repeat the process to update the
            # exposure.
            metadata["exposure_time"] *= 2.0
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run(
                [raw_filename], run=self.outputRun, skip_existing_exposures=True, update_exposure_records=True
            )
            (record2,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record2.exposure_time, record1.exposure_time * 2)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)
class TestRawIngestTaskPickle(unittest.TestCase):
    """Test that pickling of the RawIngestTask works properly."""

    @classmethod
    def setUpClass(cls):
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {})

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)
        self.config = RawIngestTask.ConfigClass()
        self.config.transfer = "copy"  # safe non-default value
        self.task = RawIngestTask(config=self.config, butler=self.butler)

    def testPickleTask(self):
        # Round-trip the task through pickle and confirm the key state
        # survives serialization.  (Local renamed from `copy` to avoid
        # shadowing the stdlib module name.)
        clone = pickle.loads(pickle.dumps(self.task))
        self.assertEqual(self.task.getFullName(), clone.getFullName())
        self.assertEqual(self.task.log.name, clone.log.name)
        self.assertEqual(self.task.config, clone.config)
        self.assertEqual(self.task.butler._config, clone.butler._config)
        self.assertEqual(self.task.butler.collections, clone.butler.collections)
        self.assertEqual(self.task.butler.run, clone.butler.run)
        self.assertEqual(self.task.universe, clone.universe)
        self.assertEqual(self.task.datasetType, clone.datasetType)
# Script entry point: run the test suite when executed directly.
if __name__ == "__main__":
    unittest.main()