Coverage for tests/test_ingest.py: 24%

Keyboard shortcuts on this page:
  r m x p   toggle line displays
  j k       next/previous highlighted chunk
  0 (zero)  top of page
  1 (one)   first highlighted chunk
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22import json
23import os
24import pickle
25import shutil
26import tempfile
27import unittest
29import lsst.daf.butler.tests as butlerTests
30from lsst.daf.butler import DatasetType, Butler, DataCoordinate, Config
31from lsst.daf.butler.registry import ConflictingDefinitionError
32from lsst.daf.butler.core.utils import getFullTypeName
34from lsst.obs.base.ingest_tests import IngestTestBase
35from lsst.obs.base.instrument_tests import DummyCam
36from lsst.obs.base import RawIngestTask
39TESTDIR = os.path.abspath(os.path.dirname(__file__))
40INGESTDIR = os.path.join(TESTDIR, "data", "ingest")
class DummyCamRawIngestTask(RawIngestTask):
    """For DummyCam we ingest a different dataset type that can return
    a non-Exposure."""

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task.
        """
        # DummyCam raws are simple dicts rather than Exposures, so the
        # dataset type uses the StructuredDataDict storage class.
        dimensions = ("instrument", "detector", "exposure")
        universe = self.butler.registry.dimensions
        return DatasetType(
            "raw_dict",
            dimensions,
            "StructuredDataDict",
            universe=universe,
        )
class RawIngestTestCase(IngestTestBase, unittest.TestCase):
    """Test ingest using JSON sidecar files."""

    # Configuration attributes consumed by IngestTestBase.
    # Name of the dataset type produced by ingest (see DummyCamRawIngestTask).
    ingestDatasetTypeName = "raw_dict"
    # Fully-qualified task class name used by the base class to run ingest.
    rawIngestTask = getFullTypeName(DummyCamRawIngestTask)
    # DummyCam has no curated calibrations, so nothing to write.
    curatedCalibrationDatasetTypes = ()
    ingestDir = TESTDIR
    instrumentClassName = "lsst.obs.base.instrument_tests.DummyCam"
    # Single raw file with a JSON sidecar alongside it.
    file = os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")
    dataIds = [dict(instrument="DummyCam", exposure=100, detector=0)]

    @property
    def visits(self):
        # Expected visit -> exposures mapping after visit definition;
        # the single ingested exposure (100) maps to visit 100.
        butler = Butler(self.root, collections=[self.outputRun])
        return {
            DataCoordinate.standardize(
                instrument="DummyCam",
                visit=100,
                universe=butler.registry.dimensions
            ): [
                DataCoordinate.standardize(
                    instrument="DummyCam",
                    exposure=100,
                    universe=butler.registry.dimensions
                )
            ]
        }

    def testWriteCuratedCalibrations(self):
        """There are no curated calibrations in this test instrument"""
        # Override the base-class test with a no-op.
        pass
class RawIngestImpliedIndexTestCase(RawIngestTestCase):
    """Test ingest using JSON index files."""

    # Same test suite as the sidecar case, but this data file lives in a
    # directory with a JSON index file that is discovered implicitly.
    file = os.path.join(INGESTDIR, "indexed_data", "dataset_1.yaml")
class RawIngestEdgeCaseTestCase(unittest.TestCase):
    """Test ingest using non-standard approaches including failures."""

    @classmethod
    def setUpClass(cls):
        butlerConfig = """
datastore:
  # Want to ingest real files so can't use in-memory datastore
  cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
"""
        # One repo shared by all tests in this class; each test gets its
        # own output collection via makeTestCollection in setUp.
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {}, config=Config.fromYaml(butlerConfig))
        DummyCam().register(cls.creatorButler.registry)

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)
        self.outputRun = self.butler.run

        config = RawIngestTask.ConfigClass()
        self.task = DummyCamRawIngestTask(config=config, butler=self.butler)

        # Different test files.
        # FITS file whose metadata cannot be translated.
        self.bad_metadata_file = os.path.join(TESTDIR, "data", "small.fits")
        # Valid DummyCam raw with a JSON sidecar.
        self.good_file = os.path.join(INGESTDIR, "sidecar_data", "dataset_2.yaml")
        # Valid metadata but for an instrument (HSC) not in this registry.
        self.bad_instrument_file = os.path.join(TESTDIR, "data", "calexp.fits")

    def testSimpleIngest(self):
        """Ingest a good file serially and then in parallel."""
        # Use the default per-instrument run for this.
        self.task.run([self.good_file])
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections="DummyCam/raw/all"))
        self.assertEqual(len(datasets), 1)

        # Now parallelized.
        files = [self.good_file,
                 os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")]
        self.task.run(files, processes=2, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

    def testExplicitIndex(self):
        """Ingest via explicitly named index files, including duplicate
        and conflicting index content."""
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        self.task.run(files, run=self.outputRun)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Try again with an explicit index and a file that is in that index.
        files.append(os.path.join(INGESTDIR, "indexed_data", "dataset_2.yaml"))
        new_run = self.outputRun + "b"
        self.task.run(files, run=new_run)

        # NOTE(review): this queries the original run rather than new_run,
        # so it re-checks the first ingest instead of the one just made —
        # possibly should be collections=new_run; verify intent.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Now with two index files that point to the same files.
        # Look for the warning from duplication.
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json"),
                 os.path.join(INGESTDIR, "indexed_data", "translated_subdir", "_index.json")]
        new_run = self.outputRun + "c"

        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring content", cm.output[0])

        # NOTE(review): again queries self.outputRun, not new_run — verify.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
                 os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        new_run = self.outputRun + "d"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file but overriding", cm.output[0])

        # Reversing the order should change the warning.
        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json"),
                 os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]

        new_run = self.outputRun + "e"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring", cm.output[0])

        # Bad index file.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # Bad index file due to bad instrument.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_instrument", "_index.json")]
        with self.assertLogs(level="WARNING") as cm:
            with self.assertRaises(RuntimeError):
                self.task.run(files, run=self.outputRun)
        self.assertIn("Instrument HSC for file", cm.output[0])

    def testBadExposure(self):
        """Test that bad exposures trigger the correct failure modes.

        This is the only test that uses the bad definition of dataset 4
        because exposure definitions are defined globally in a butler registry.
        """

        # Ingest 3 files. 2 of them will implicitly find an index and one
        # will use a sidecar.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3)]
        new_run = self.outputRun
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 3)

        # Test fail fast.
        self.task.config.failFast = True

        # Ingest files with conflicting exposure definitions.
        # Ingest 3 files. One of them will implicitly find an index and one
        # will use a sidecar. The 3rd will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 3, 4)]
        new_run = self.outputRun + "_bad_exposure"
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run(files, run=new_run)

    def testBadFile(self):
        """Try to ingest a bad file."""
        files = [self.bad_metadata_file]

        with self.assertRaises(RuntimeError) as cm:
            # Default is to raise an error at the end.
            self.task.run(files, run=self.outputRun)
        self.assertIn("Some failures", str(cm.exception))

        # Including a good file will result in ingest working but still
        # raises (we might want to move this to solely happen in the
        # command line invocation).
        files.append(self.good_file)

        # Also include a file with unknown instrument.
        files.append(self.bad_instrument_file)

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        # Only the good file should have been ingested.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 1)

        # Fail fast will trigger a run time error with different text.
        # Use a different output run to be sure we are not failing because
        # of the attempt to ingest twice.
        self.task.config.failFast = True
        new_run = self.outputRun + "b"
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_metadata_file, self.good_file], run=new_run)
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Attempt to ingest good file again -- this will fail for a different
        # reason than failed metadata extraction.
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run([self.good_file], run=self.outputRun)

        # Ingest a file with good metadata but unknown instrument.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_instrument_file], run=self.outputRun)
        self.assertIn("Instrument HSC", str(cm.exception))

        # Ingest of a metadata index file that will fail translation.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")])
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Ingest of a bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")])
        self.assertIn("Problem reading index file", str(cm.exception))

        # Ingest of an implied bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_implied", "dataset_2.yaml")])

    def testCallbacks(self):
        """Test the callbacks for failures."""

        # Define the callbacks.  Each one records its argument so the
        # assertions below can count invocations.
        metadata_failures = []
        successes = []
        ingest_failures = []

        def on_metadata_failure(filename, exc):
            metadata_failures.append(filename)

        def on_success(datasets):
            successes.append(datasets)

        def on_ingest_failure(exposure, exc):
            ingest_failures.append(exposure)

        # Need our own task instance
        config = RawIngestTask.ConfigClass()
        self.task = DummyCamRawIngestTask(config=config, butler=self.butler,
                                          on_metadata_failure=on_metadata_failure,
                                          on_success=on_success,
                                          on_ingest_failure=on_ingest_failure)

        files = [self.good_file, self.bad_metadata_file, self.bad_instrument_file]

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # One good file; two metadata failures (bad metadata + bad
        # instrument); nothing reached the ingest stage and failed there.
        self.assertEqual(len(successes), 1)
        self.assertEqual(len(metadata_failures), 2)
        self.assertEqual(len(ingest_failures), 0)

        # Try the good one a second time.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)

        # Duplicate ingest counts as an ingest failure, not a success.
        self.assertEqual(len(successes), 1)
        self.assertEqual(len(ingest_failures), 1)

        # An index file with metadata that won't translate.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 2)

        # Bad index file.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 1)

        # Ingest two files that have conflicting exposure metadata.
        ingest_failures[:] = []
        successes[:] = []
        # Ingest 4 files. 2 of them will implicitly find an index and one
        # will use a sidecar. The 4th will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3, 4)]
        new_run = self.outputRun + "_fail"
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=new_run)
        self.assertEqual(len(ingest_failures), 1)
        self.assertEqual(len(successes), 3)

    def testSkipExistingExposures(self):
        """Test that skip_existing_exposures=True avoids exceptions from trying
        to ingest the same file twice.

        Notes
        -----
        This option also prevents not-ingested-yet raws from being ingested
        when exposure already exists, but that's (A) hard to test given the
        test data we have now and (B) not really ideal behavior, just behavior
        we can live with in order to have a way to avoid keep duplicate ingests
        from being an error.
        """
        # Ingest the first time.
        self.task.run([self.good_file], run=self.outputRun)
        # Attempt to ingest a second time with skip_existing_exposures=False
        # (default). This should fail.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)
        # Try again with skip_existing_exposures=True; no exception expected.
        self.task.run([self.good_file], run=self.outputRun, skip_existing_exposures=True)

    def testUpdateExposureRecords(self):
        """Test that update_exposure_records=True allows metadata to be
        modified.
        """
        config = RawIngestTask.ConfigClass(failFast=True)
        task = DummyCamRawIngestTask(config=config, butler=self.butler)
        with open(os.path.join(INGESTDIR, "sidecar_data", "dataset_1.json"), 'r') as file:
            metadata = json.load(file)
        # Modify unique identifiers to avoid clashes with ingests from
        # other test methods in this test case, because those share a
        # data repository.
        metadata["observation_id"] = "DummyDataset_testUpdateExposureRecords"
        metadata["observation_counter"] = 10
        metadata["exposure_id"] = 500
        metadata["exposure_group"] = "50"
        metadata["visit_id"] = 500
        base_filename = "dataset"
        try:
            # Copy the original file to be ingested (.yaml) to a temporary
            # directory, and write the new metadata next to it.
            # NOTE(review): tmp_dir is assigned inside the try; if mkdtemp
            # raised, the finally clause would hit a NameError — consider
            # moving the mkdtemp call above the try.
            tmp_dir = tempfile.mkdtemp(dir=TESTDIR)
            raw_filename = os.path.join(tmp_dir, f"{base_filename}.yaml")
            sidecar_filename = os.path.join(tmp_dir, f"{base_filename}.json")
            shutil.copy(self.good_file, raw_filename)
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun)
            (record1,) = set(self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam",
                                                                        exposure=500))
            self.assertEqual(record1.exposure_time, metadata["exposure_time"])
            # Modify some metadata and repeat the process to update the
            # exposure.
            metadata["exposure_time"] *= 2.0
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun, skip_existing_exposures=True,
                     update_exposure_records=True)
            (record2,) = set(self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam",
                                                                        exposure=500))
            self.assertEqual(record2.exposure_time, record1.exposure_time*2)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)
class TestRawIngestTaskPickle(unittest.TestCase):
    """Test that pickling of the RawIngestTask works properly."""

    @classmethod
    def setUpClass(cls):
        # Temporary repository shared by all tests; removed in tearDownClass.
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {})

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)

        self.config = RawIngestTask.ConfigClass()
        self.config.transfer = "copy"  # safe non-default value
        self.task = RawIngestTask(config=self.config, butler=self.butler)

    def testPickleTask(self):
        # Round-trip the task through pickle and check that the salient
        # state survives intact.
        blob = pickle.dumps(self.task)
        restored = pickle.loads(blob)
        # Task-level state.
        self.assertEqual(self.task.getFullName(), restored.getFullName())
        self.assertEqual(self.task.log.name, restored.log.name)
        self.assertEqual(self.task.config, restored.config)
        # Embedded butler state.
        self.assertEqual(self.task.butler._config, restored.butler._config)
        self.assertEqual(self.task.butler.collections, restored.butler.collections)
        self.assertEqual(self.task.butler.run, restored.butler.run)
        # Derived attributes.
        self.assertEqual(self.task.universe, restored.universe)
        self.assertEqual(self.task.datasetType, restored.datasetType)
# Run the test suite when the file is executed directly.
if __name__ == "__main__":
    unittest.main()