Coverage for tests/test_ingest.py : 21%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22import json
23import os
24import pickle
25import shutil
26import tempfile
27import unittest
29import lsst.log
30import lsst.daf.butler.tests as butlerTests
31from lsst.daf.butler import DatasetType, Butler, DataCoordinate, Config
32from lsst.daf.butler.registry import ConflictingDefinitionError
33from lsst.daf.butler.core.utils import getFullTypeName
35from lsst.obs.base.ingest_tests import IngestTestBase
36from lsst.obs.base.instrument_tests import DummyCam
37from lsst.obs.base import RawIngestTask
# Absolute path to the directory containing this test file.
TESTDIR = os.path.abspath(os.path.dirname(__file__))

# Root of the ingest test data tree (fake raw files plus sidecar/index
# metadata used by the test cases below).
INGESTDIR = os.path.join(TESTDIR, "data", "ingest")
class DummyCamRawIngestTask(RawIngestTask):
    """For DummyCam we ingest a different dataset type that can return
    a non-Exposure."""

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task.

        DummyCam "raws" are plain dicts, so a ``StructuredDataDict``
        storage class is used instead of an Exposure.
        """
        raw_dimensions = ("instrument", "detector", "exposure")
        return DatasetType(
            "raw_dict",
            raw_dimensions,
            "StructuredDataDict",
            universe=self.butler.registry.dimensions,
        )
class RawIngestTestCase(IngestTestBase, unittest.TestCase):
    """Test ingest using JSON sidecar files."""

    ingestDatasetTypeName = "raw_dict"
    rawIngestTask = getFullTypeName(DummyCamRawIngestTask)
    curatedCalibrationDatasetTypes = ()
    ingestDir = TESTDIR
    instrumentClassName = "lsst.obs.base.instrument_tests.DummyCam"
    file = os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")
    dataIds = [dict(instrument="DummyCam", exposure=100, detector=0)]

    @property
    def visits(self):
        # Map the single test visit to the single exposure it comprises.
        butler = Butler(self.root, collections=[self.outputRun])
        universe = butler.registry.dimensions
        visit = DataCoordinate.standardize(
            instrument="DummyCam",
            visit=100,
            universe=universe,
        )
        exposure = DataCoordinate.standardize(
            instrument="DummyCam",
            exposure=100,
            universe=universe,
        )
        return {visit: [exposure]}

    def testWriteCuratedCalibrations(self):
        """There are no curated calibrations in this test instrument"""
        pass
class RawIngestImpliedIndexTestCase(RawIngestTestCase):
    """Test ingest using JSON index files."""

    # Same test scenario as the parent class, but the dataset lives in a
    # directory where metadata comes from an implied ``_index.json`` file
    # rather than a per-dataset JSON sidecar file.
    file = os.path.join(INGESTDIR, "indexed_data", "dataset_1.yaml")
class RawIngestEdgeCaseTestCase(unittest.TestCase):
    """Test ingest using non-standard approaches including failures."""

    @classmethod
    def setUpClass(cls):
        # Override the default test-repo configuration: these tests ingest
        # real files from disk, so a file-backed datastore is required.
        butlerConfig = """
datastore:
  # Want to ingest real files so can't use in-memory datastore
  cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
"""
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {}, config=Config.fromYaml(butlerConfig))
        # Register the test instrument once; the registry is shared by every
        # test method in this case.
        DummyCam().register(cls.creatorButler.registry)

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        # Each test gets a fresh output collection in the shared repo.
        self.butler = butlerTests.makeTestCollection(self.creatorButler)
        self.outputRun = self.butler.run

        config = RawIngestTask.ConfigClass()
        self.task = DummyCamRawIngestTask(config=config, butler=self.butler)

        # Different test files.
        # A file whose metadata cannot be translated ("Problem extracting
        # metadata" is raised for it below).
        self.bad_metadata_file = os.path.join(TESTDIR, "data", "small.fits")
        # A fully ingestable DummyCam dataset with a JSON sidecar.
        self.good_file = os.path.join(INGESTDIR, "sidecar_data", "dataset_2.yaml")
        # Metadata translates but reports an instrument (HSC) that is not
        # registered in this repo.
        self.bad_instrument_file = os.path.join(TESTDIR, "data", "calexp.fits")

    def testSimpleIngest(self):
        # Use the default per-instrument run for this.
        self.task.run([self.good_file])
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections="DummyCam/raw/all"))
        self.assertEqual(len(datasets), 1)

        # Now parallelized.
        files = [self.good_file,
                 os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")]
        self.task.run(files, processes=2, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

    def testExplicitIndex(self):
        # Giving the index file itself ingests every file it lists.
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        self.task.run(files, run=self.outputRun)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Try again with an explicit index and a file that is in that index.
        files.append(os.path.join(INGESTDIR, "indexed_data", "dataset_2.yaml"))
        new_run = self.outputRun + "b"
        self.task.run(files, run=new_run)

        # NOTE(review): this queries self.outputRun even though the ingest
        # above targeted new_run — possibly intentional (checking the
        # original run is untouched), but confirm it shouldn't be new_run.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Now with two index files that point to the same files.
        # Look for the warning from duplication.
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json"),
                 os.path.join(INGESTDIR, "indexed_data", "translated_subdir", "_index.json")]
        new_run = self.outputRun + "c"

        with self.assertLogs(level="WARNING") as cm:
            with lsst.log.UsePythonLogging():
                self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring content", cm.output[0])

        # NOTE(review): same outputRun-vs-new_run question as above.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
                 os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        new_run = self.outputRun + "d"
        with self.assertLogs(level="WARNING") as cm:
            with lsst.log.UsePythonLogging():
                self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file but overriding", cm.output[0])

        # Reversing the order should change the warning.
        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json"),
                 os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]

        new_run = self.outputRun + "e"
        with self.assertLogs(level="WARNING") as cm:
            with lsst.log.UsePythonLogging():
                self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring", cm.output[0])

        # Bad index file.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # Bad index file due to bad instrument.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_instrument", "_index.json")]
        with self.assertLogs(level="WARNING") as cm:
            with lsst.log.UsePythonLogging():
                with self.assertRaises(RuntimeError):
                    self.task.run(files, run=self.outputRun)
        self.assertIn("Instrument HSC for file", cm.output[0])

    def testBadExposure(self):
        """Test that bad exposures trigger the correct failure modes.

        This is the only test that uses the bad definition of dataset 4
        because exposure definitions are defined globally in a butler registry.
        """
        # Ingest 3 files. 2 of them will implicitly find an index and one
        # will use a sidecar.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3)]
        new_run = self.outputRun
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 3)

        # Test fail fast.
        self.task.config.failFast = True

        # Ingest files with conflicting exposure definitions.
        # Ingest 3 files. One of them will implicitly find an index and one
        # will use a sidecar. The 3rd will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 3, 4)]
        new_run = self.outputRun + "_bad_exposure"
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run(files, run=new_run)

    def testBadFile(self):
        """Try to ingest a bad file."""
        files = [self.bad_metadata_file]

        with self.assertRaises(RuntimeError) as cm:
            # Default is to raise an error at the end.
            self.task.run(files, run=self.outputRun)
        self.assertIn("Some failures", str(cm.exception))

        # Including a good file will result in ingest working but still
        # raises (we might want to move this to solely happen in the
        # command line invocation).
        files.append(self.good_file)

        # Also include a file with unknown instrument.
        files.append(self.bad_instrument_file)

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        # Only the good file should have been ingested.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 1)

        # Fail fast will trigger a run time error with different text.
        # Use a different output run to be sure we are not failing because
        # of the attempt to ingest twice.
        self.task.config.failFast = True
        new_run = self.outputRun + "b"
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_metadata_file, self.good_file], run=new_run)
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Attempt to ingest good file again -- this will fail for a different
        # reason than failed metadata extraction.
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run([self.good_file], run=self.outputRun)

        # Ingest a file with good metadata but unknown instrument.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_instrument_file], run=self.outputRun)
        self.assertIn("Instrument HSC", str(cm.exception))

        # Ingest of a metadata index file that will fail translation.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")])
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Ingest of a bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")])
        self.assertIn("Problem reading index file", str(cm.exception))

        # Ingest of an implied bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_implied", "dataset_2.yaml")])

    def testCallbacks(self):
        """Test the callbacks for failures."""
        # Define the callbacks.  Each records its argument so the counts can
        # be asserted after each run.
        metadata_failures = []
        successes = []
        ingest_failures = []

        def on_metadata_failure(filename, exc):
            metadata_failures.append(filename)

        def on_success(datasets):
            successes.append(datasets)

        def on_ingest_failure(exposure, exc):
            ingest_failures.append(exposure)

        # Need our own task instance
        config = RawIngestTask.ConfigClass()
        self.task = DummyCamRawIngestTask(config=config, butler=self.butler,
                                          on_metadata_failure=on_metadata_failure,
                                          on_success=on_success,
                                          on_ingest_failure=on_ingest_failure)

        files = [self.good_file, self.bad_metadata_file, self.bad_instrument_file]

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # One good file, two metadata failures, no ingest failures.
        self.assertEqual(len(successes), 1)
        self.assertEqual(len(metadata_failures), 2)
        self.assertEqual(len(ingest_failures), 0)

        # Try the good one a second time.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)

        # Re-ingest counts as an ingest failure, not a metadata failure.
        self.assertEqual(len(successes), 1)
        self.assertEqual(len(ingest_failures), 1)

        # An index file with metadata that won't translate.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 2)

        # Bad index file.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 1)

        # Ingest two files that have conflicting exposure metadata.
        ingest_failures[:] = []
        successes[:] = []
        # Ingest 4 files. 2 of them will implicitly find an index and one
        # will use a sidecar. The 4th will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3, 4)]
        new_run = self.outputRun + "_fail"
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=new_run)
        self.assertEqual(len(ingest_failures), 1)
        self.assertEqual(len(successes), 3)

    def testSkipExistingExposures(self):
        """Test that skip_existing_exposures=True avoids exceptions from trying
        to ingest the same file twice.

        Notes
        -----
        This option also prevents not-ingested-yet raws from being ingested
        when exposure already exists, but that's (A) hard to test given the
        test data we have now and (B) not really ideal behavior, just behavior
        we can live with in order to have a way to avoid keep duplicate ingests
        from being an error.
        """
        # Ingest the first time.
        self.task.run([self.good_file], run=self.outputRun)
        # Attempt to ingest a second time with skip_existing_exposures=False
        # (default). This should fail.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)
        # Try again with `skip_existing_exposures=True.
        self.task.run([self.good_file], run=self.outputRun, skip_existing_exposures=True)

    def testUpdateExposureRecords(self):
        """Test that update_exposure_records=True allows metadata to be
        modified.
        """
        config = RawIngestTask.ConfigClass(failFast=True)
        task = DummyCamRawIngestTask(config=config, butler=self.butler)
        with open(os.path.join(INGESTDIR, "sidecar_data", "dataset_1.json"), 'r') as file:
            metadata = json.load(file)
        # Modify unique identifiers to avoid clashes with ingests from
        # other test methods in this test case, because those share a a
        # data repository.
        metadata["observation_id"] = "DummyDataset_testUpdateExposureRecords"
        metadata["observation_counter"] = 10
        metadata["exposure_id"] = 500
        metadata["exposure_group"] = "50"
        metadata["visit_id"] = 500
        base_filename = "dataset"
        try:
            # Copy the original file to be ingested (.yaml) to a temporary
            # directory, and write the new metadata next to it.
            tmp_dir = tempfile.mkdtemp(dir=TESTDIR)
            raw_filename = os.path.join(tmp_dir, f"{base_filename}.yaml")
            sidecar_filename = os.path.join(tmp_dir, f"{base_filename}.json")
            shutil.copy(self.good_file, raw_filename)
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun)
            (record1,) = set(self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam",
                                                                        exposure=500))
            self.assertEqual(record1.exposure_time, metadata["exposure_time"])
            # Modify some metadata and repeat the process to update the
            # exposure.
            metadata["exposure_time"] *= 2.0
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun, skip_existing_exposures=True,
                     update_exposure_records=True)
            (record2,) = set(self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam",
                                                                        exposure=500))
            self.assertEqual(record2.exposure_time, record1.exposure_time*2)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)
class TestRawIngestTaskPickle(unittest.TestCase):
    """Test that pickling of the RawIngestTask works properly."""

    @classmethod
    def setUpClass(cls):
        # One throwaway repository shared by every test in this case.
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {})

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)
        self.config = RawIngestTask.ConfigClass()
        self.config.transfer = "copy"  # safe non-default value
        self.task = RawIngestTask(config=self.config, butler=self.butler)

    def testPickleTask(self):
        # Round-trip the task through pickle and confirm its observable
        # state is preserved.
        restored = pickle.loads(pickle.dumps(self.task))
        self.assertEqual(self.task.getFullName(), restored.getFullName())
        self.assertEqual(self.task.log.name, restored.log.name)
        self.assertEqual(self.task.config, restored.config)
        self.assertEqual(self.task.butler._config, restored.butler._config)
        self.assertEqual(self.task.butler.collections, restored.butler.collections)
        self.assertEqual(self.task.butler.run, restored.butler.run)
        self.assertEqual(self.task.universe, restored.universe)
        self.assertEqual(self.task.datasetType, restored.datasetType)
if __name__ == "__main__":
    unittest.main()