Coverage for tests/test_ingest.py : 23%

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import os
import pickle
import shutil
import tempfile
import unittest

import lsst.log
import lsst.daf.butler.tests as butlerTests
from lsst.daf.butler import DatasetType, Butler, DataCoordinate, Config
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.daf.butler.core.utils import getFullTypeName

from lsst.obs.base.ingest_tests import IngestTestBase
from lsst.obs.base.instrument_tests import DummyCam
from lsst.obs.base import RawIngestTask

TESTDIR = os.path.abspath(os.path.dirname(__file__))
INGESTDIR = os.path.join(TESTDIR, "data", "ingest")
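
# Test data lives under data/: small.fits and calexp.fits at the top level,
# with the YAML "raw" datasets and their JSON sidecar/index files under
# data/ingest.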


class DummyCamRawIngestTask(RawIngestTask):
    """For DummyCam we ingest a different dataset type that can return
    a non-Exposure."""

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task.
        """
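        # "raw_dict" uses the StructuredDataDict storage class, so the
        # ingested "raw" can be a plain dict (here read from the YAML test
        # files) rather than an Exposure.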
        return DatasetType("raw_dict", ("instrument", "detector", "exposure"), "StructuredDataDict",
                           universe=self.butler.registry.dimensions)


class RawIngestTestCase(IngestTestBase, unittest.TestCase):
    """Test ingest using JSON sidecar files."""

    ingestDatasetTypeName = "raw_dict"
    rawIngestTask = getFullTypeName(DummyCamRawIngestTask)
    curatedCalibrationDatasetTypes = ()
    ingestDir = TESTDIR
    instrumentClassName = "lsst.obs.base.instrument_tests.DummyCam"
    file = os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")
    dataIds = [dict(instrument="DummyCam", exposure=100, detector=0)]
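
    # One DummyCam visit (100) defined by the single ingested exposure (100).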
    @property
    def visits(self):
        butler = Butler(self.root, collections=[self.outputRun])
        return {
            DataCoordinate.standardize(
                instrument="DummyCam",
                visit=100,
                universe=butler.registry.dimensions
            ): [
                DataCoordinate.standardize(
                    instrument="DummyCam",
                    exposure=100,
                    universe=butler.registry.dimensions
                )
            ]
        }

    def testWriteCuratedCalibrations(self):
        """There are no curated calibrations in this test instrument"""
        pass


class RawIngestImpliedIndexTestCase(RawIngestTestCase):
    """Test ingest using JSON index files."""
    file = os.path.join(INGESTDIR, "indexed_data", "dataset_1.yaml")


class RawIngestEdgeCaseTestCase(unittest.TestCase):
    """Test ingest using non-standard approaches including failures."""

    @classmethod
    def setUpClass(cls):
        butlerConfig = """
datastore:
  # Want to ingest real files so can't use in-memory datastore
  cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
"""
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {}, config=Config.fromYaml(butlerConfig))
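        # Registering DummyCam adds its instrument/detector/filter dimension
        # records to the registry so raws can be ingested against it.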
        DummyCam().register(cls.creatorButler.registry)

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)
        self.outputRun = self.butler.run

        config = RawIngestTask.ConfigClass()
        self.task = DummyCamRawIngestTask(config=config, butler=self.butler)

        # Different test files.
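        # small.fits has headers that cannot be translated to exposure
        # metadata; calexp.fits translates but belongs to an instrument (HSC)
        # that is not registered in this repo; dataset_2.yaml is a valid
        # DummyCam raw with a JSON sidecar.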
        self.bad_metadata_file = os.path.join(TESTDIR, "data", "small.fits")
        self.good_file = os.path.join(INGESTDIR, "sidecar_data", "dataset_2.yaml")
        self.bad_instrument_file = os.path.join(TESTDIR, "data", "calexp.fits")

    def testSimpleIngest(self):
        # Use the default per-instrument run for this.
        self.task.run([self.good_file])
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections="DummyCam/raw/all"))
        self.assertEqual(len(datasets), 1)

        # Now parallelized.
        files = [self.good_file,
                 os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")]
        self.task.run(files, processes=2, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

    def testExplicitIndex(self):
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json")]
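        # The index file describes two datasets, so a single input path
        # should yield two ingested raws.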
        self.task.run(files, run=self.outputRun)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Try again with an explicit index and a file that is in that index.
        files.append(os.path.join(INGESTDIR, "indexed_data", "dataset_2.yaml"))
        new_run = self.outputRun + "b"
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 2)

        # Now with two index files that point to the same files.
        # Look for the warning from the duplication.
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json"),
                 os.path.join(INGESTDIR, "indexed_data", "translated_subdir", "_index.json")]
        new_run = self.outputRun + "c"

        with self.assertLogs(level="WARNING") as cm:
            with lsst.log.UsePythonLogging():
                self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring content", cm.output[0])

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 2)

        # Again with an index file of raw metadata and one of translated
        # metadata.  Translated should win.  Put the metadata one first to
        # test that order is preserved.
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
                 os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        new_run = self.outputRun + "d"
        with self.assertLogs(level="WARNING") as cm:
            with lsst.log.UsePythonLogging():
                self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file but overriding", cm.output[0])

        # Reversing the order changes the warning: the translated index now
        # comes first, so the metadata index content is ignored.
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json"),
                 os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]

        new_run = self.outputRun + "e"
        with self.assertLogs(level="WARNING") as cm:
            with lsst.log.UsePythonLogging():
                self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring", cm.output[0])

        # Bad index file.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # Bad index file due to bad instrument.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_instrument", "_index.json")]
        with self.assertLogs(level="WARNING") as cm:
            with lsst.log.UsePythonLogging():
                with self.assertRaises(RuntimeError):
                    self.task.run(files, run=self.outputRun)
        self.assertIn("Instrument HSC for file", cm.output[0])

    def testBadExposure(self):
        """Test that bad exposures trigger the correct failure modes.

        This is the only test that uses the bad definition of dataset 4,
        because exposure definitions are global to a butler registry.
        """
        # Ingest 3 files.  Two of them will implicitly find an index and one
        # will use a sidecar.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3)]
        new_run = self.outputRun
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 3)

        # Test fail fast.
        self.task.config.failFast = True

        # Ingest files with conflicting exposure definitions.
        # Ingest 3 files: one will implicitly find an index, one will use a
        # sidecar, and the third will fail due to the exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 3, 4)]
        new_run = self.outputRun + "_bad_exposure"
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run(files, run=new_run)

    def testBadFile(self):
        """Try to ingest a bad file."""
        files = [self.bad_metadata_file]

        with self.assertRaises(RuntimeError) as cm:
            # Default is to raise an error at the end.
            self.task.run(files, run=self.outputRun)
        self.assertIn("Some failures", str(cm.exception))

        # Including a good file results in the good file being ingested, but
        # the task still raises (we might want to move this behavior so it
        # happens solely in the command-line invocation).
        files.append(self.good_file)

        # Also include a file with an unknown instrument.
        files.append(self.bad_instrument_file)

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 1)

        # Fail fast will trigger a runtime error with different text.
        # Use a different output run to be sure we are not failing because
        # of the attempt to ingest twice.
        self.task.config.failFast = True
        new_run = self.outputRun + "b"
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_metadata_file, self.good_file], run=new_run)
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Attempt to ingest the good file again -- this will fail for a
        # different reason than failed metadata extraction.
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run([self.good_file], run=self.outputRun)

        # Ingest a file with good metadata but an unknown instrument.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_instrument_file], run=self.outputRun)
        self.assertIn("Instrument HSC", str(cm.exception))

        # Ingest of a metadata index file that will fail translation.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")])
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Ingest of a bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")])
        self.assertIn("Problem reading index file", str(cm.exception))

        # Ingest of an implied bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_implied", "dataset_2.yaml")])

    def testCallbacks(self):
        """Test the callbacks for failures."""

        # Define the callbacks.
        metadata_failures = []
        successes = []
        ingest_failures = []

        def on_metadata_failure(filename, exc):
            metadata_failures.append(filename)

        def on_success(datasets):
            successes.append(datasets)

        def on_ingest_failure(exposure, exc):
            ingest_failures.append(exposure)

        # Need our own task instance.
        config = RawIngestTask.ConfigClass()
        self.task = DummyCamRawIngestTask(config=config, butler=self.butler,
                                          on_metadata_failure=on_metadata_failure,
                                          on_success=on_success,
                                          on_ingest_failure=on_ingest_failure)

        files = [self.good_file, self.bad_metadata_file, self.bad_instrument_file]

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
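
        # The unknown-instrument file is reported via on_metadata_failure as
        # well, so two metadata failures are expected alongside the single
        # success.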
        self.assertEqual(len(successes), 1)
        self.assertEqual(len(metadata_failures), 2)
        self.assertEqual(len(ingest_failures), 0)

        # Try the good one a second time.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(ingest_failures), 1)

        # An index file with metadata that won't translate.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 2)

        # Bad index file.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 1)

        # Ingest files that have conflicting exposure metadata.
        ingest_failures[:] = []
        successes[:] = []
        # Ingest 4 files: two will implicitly find an index, one will use a
        # sidecar, and the fourth will fail due to the exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3, 4)]
        new_run = self.outputRun + "_fail"
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=new_run)
        self.assertEqual(len(ingest_failures), 1)
        self.assertEqual(len(successes), 3)


class TestRawIngestTaskPickle(unittest.TestCase):
    """Test that pickling of the RawIngestTask works properly."""

    @classmethod
    def setUpClass(cls):
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {})

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)

        self.config = RawIngestTask.ConfigClass()
        self.config.transfer = "copy"  # safe non-default value
        self.task = RawIngestTask(config=self.config, butler=self.butler)
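
    # Pickling matters because parallel ingest (processes > 1) presumably
    # hands the task to worker processes; round-tripping must preserve its
    # configuration and butler state.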
    def testPickleTask(self):
        stream = pickle.dumps(self.task)
        copy = pickle.loads(stream)
        self.assertEqual(self.task.getFullName(), copy.getFullName())
        self.assertEqual(self.task.log.getName(), copy.log.getName())
        self.assertEqual(self.task.config, copy.config)
        self.assertEqual(self.task.butler._config, copy.butler._config)
        self.assertEqual(self.task.butler.collections, copy.butler.collections)
        self.assertEqual(self.task.butler.run, copy.butler.run)
        self.assertEqual(self.task.universe, copy.universe)
        self.assertEqual(self.task.datasetType, copy.datasetType)


if __name__ == "__main__":
    unittest.main()