Coverage for python/lsst/obs/base/ingest_tests.py: 32%
215 statements
coverage.py v7.4.3, created at 2024-03-05 12:09 +0000

# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""Base class for writing Gen3 raw data ingest tests.
"""

__all__ = ("IngestTestBase",)

import abc
import os
import shutil
import tempfile
import unittest

import lsst.afw.cameraGeom
import lsst.afw.cameraGeom.testUtils  # For assertDetectorsEqual
import lsst.obs.base
from lsst.daf.butler import Butler, Registry
from lsst.daf.butler.cli.butler import cli as butlerCli
from lsst.daf.butler.cli.utils import LogCliRunner
from lsst.pipe.base import Instrument
from lsst.resources import ResourcePath
from lsst.utils import doImportType

from . import script


class IngestTestBase(metaclass=abc.ABCMeta):
    """Base class for tests of Gen3 ingest. Subclass from this, then
    `unittest.TestCase`, to get a working test suite.
    """

    ingestDir = ""
    """Root path to ingest files into. Typically `obs_package/tests/`; the
    actual directory will be a tempdir under this one.
    """

    ingestDatasetTypeName = "raw"
    """The DatasetType to use for the ingest.

    If this is not an Exposure dataset type the tests will be more limited.
    """

    dataIds = []
    """List of butler data IDs of files that should have been ingested."""

    file = ""
    """Full path to a file to ingest in tests."""

    filterLabel = None
    """The lsst.afw.image.FilterLabel that should be returned by the above
    file."""

    rawIngestTask = "lsst.obs.base.RawIngestTask"
    """The task to use in the ingest test."""

    curatedCalibrationDatasetTypes = None
    """List or tuple of dataset types that should be present after calling
    writeCuratedCalibrations. If `None`, writeCuratedCalibrations will
    not be called and the test will be skipped."""
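
    # For example (the dataset type names here are illustrative assumptions,
    # not a definitive list), a subclass whose instrument provides curated
    # calibrations might set:
    #
    #     curatedCalibrationDatasetTypes = ("camera", "defects")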

    defineVisitsTask = lsst.obs.base.DefineVisitsTask
    """The task to use to define visits from groups of exposures.
    This is ignored if ``visits`` is `None`.
    """

    visits = {}
    """A dictionary mapping visit data IDs to the lists of exposure data IDs
    that are associated with them.
    If this is empty (but not `None`), visit definition will be run but no
    visits will be expected (e.g. because no exposures are on-sky
    observations).
    """

    seed_config = None
    """Location of a seed configuration file to pass to butler create.

    Useful if additional formatters or storage classes need to be defined.
    """

    @property
    @abc.abstractmethod
    def instrumentClassName(self):
        """The fully qualified instrument class name.

        Returns
        -------
        `str`
            The fully qualified instrument class name.
        """
        pass

    @property
    def instrumentClass(self):
        """The instrument class."""
        return doImportType(self.instrumentClassName)

    @property
    def instrumentName(self):
        """The name of the instrument.

        Returns
        -------
        `str`
            The name of the instrument.
        """
        return self.instrumentClass.getName()

    @classmethod
    def setUpClass(cls):
        # Use a temporary working directory.
        cls.root = tempfile.mkdtemp(dir=cls.ingestDir)
        cls._createRepo()

        # Register the instrument and its static metadata.
        cls._registerInstrument()

        # Determine the relevant datastore root to use for testing.
        butler = Butler(cls.root)
        roots = butler.get_datastore_roots()
        assert len(roots) == 1  # Only one datastore.
        cls.datastore_root = list(roots.values())[0]

    def setUp(self):
        # Want a unique run name per test.
        self.outputRun = "raw_ingest_" + self.id()

    @classmethod
    def tearDownClass(cls):
        if os.path.exists(cls.root):
            shutil.rmtree(cls.root, ignore_errors=True)

    def verifyIngest(self, files=None, cli=False, fullCheck=False):
        """Test that RawIngestTask ingested the expected files.

        Parameters
        ----------
        files : `list` [`str`], or `None`
            List of files to be ingested, or `None` to use ``self.file``.
        fullCheck : `bool`, optional
            If `True`, read the full raw dataset and check component
            consistency. If `False`, check that a component can be read
            but do not read the entire raw exposure.

        Notes
        -----
        Reading all the ingested test data can be expensive. The code paths
        for reading the second raw are the same as reading the first, so
        we do not gain anything by doing full checks of everything.
        Only read full pixel data for the first dataset from a file, and
        don't even do that if the caller requests otherwise.
        This only really affects files that contain multiple datasets.
        """
        butler = Butler(self.root, run=self.outputRun)
        datasets = list(butler.registry.queryDatasets(self.ingestDatasetTypeName, collections=self.outputRun))
        self.assertEqual(len(datasets), len(self.dataIds))

        # Check that the timespan of the day_obs dimension record matches the
        # exposure record.
        if "day_obs" in butler.dimensions:
            days = {
                (rec.instrument, rec.id): rec.timespan
                for rec in butler.registry.queryDimensionRecords("day_obs")
            }

            exp_records = list(butler.registry.queryDimensionRecords("exposure"))
            for exp in exp_records:
                day_span = days[exp.instrument, exp.day_obs]
                if day_span is not None:
                    self.assertTrue(
                        day_span.contains(exp.timespan.begin), f"Timespan mismatch of {exp} and {day_span}"
                    )

        # Get the URI to the first dataset and check it is inside the
        # datastore.
        datasetUri = butler.getURI(datasets[0])
        self.assertIsNotNone(datasetUri.relative_to(self.datastore_root))

        # Get the relevant dataset type.
        datasetType = butler.get_dataset_type(self.ingestDatasetTypeName)

        for dataId in self.dataIds:
            # For testing we only read the entire dataset the first time
            # round if this is an Exposure. If it's not an Exposure
            # we always read it completely, but we don't read components
            # because for an arbitrary dataset type we can't easily tell
            # what component to test.

            if not datasetType.storageClass.name.startswith("Exposure"):
                exposure = butler.get(self.ingestDatasetTypeName, dataId)
                # Could be anything, so there is nothing to test by default.
                continue

            # Check that we can read metadata from a raw.
            metadata = butler.get(f"{self.ingestDatasetTypeName}.metadata", dataId)
            if not fullCheck:
                continue
            fullCheck = False
            exposure = butler.get(self.ingestDatasetTypeName, dataId)

            # Comparing headers will not work directly because of header
            # fix-up provenance.
            metadata_headers = metadata.toDict()
            exposure_headers = exposure.getMetadata().toDict()
            metadata_headers.pop("HIERARCH ASTRO METADATA FIX DATE", None)
            exposure_headers.pop("HIERARCH ASTRO METADATA FIX DATE", None)
            self.assertEqual(metadata_headers, exposure_headers)

            # Since components follow a different code path we check that
            # the WCS matches and also that at least the shape of the image
            # is the same (rather than doing per-pixel equality).
            wcs = butler.get(f"{self.ingestDatasetTypeName}.wcs", dataId)
            self.assertEqual(wcs, exposure.getWcs())

            rawImage = butler.get(f"{self.ingestDatasetTypeName}.image", dataId)
            self.assertEqual(rawImage.getBBox(), exposure.getBBox())

            # Check that the filter label got the correct band.
            filterLabel = butler.get(f"{self.ingestDatasetTypeName}.filter", dataId)
            self.assertEqual(filterLabel, self.filterLabel)

            # Check that the exposure's Detector is the same as the component
            # we would read (this is tricky for LSST, which modifies its
            # detector at read time; for most other cameras it should be
            # trivially satisfied).
            detector = butler.get(f"{self.ingestDatasetTypeName}.detector", dataId)
            self.assertDetectorsEqual(detector, exposure.getDetector(), compareTransforms=False)

        self.checkRepo(files=files)

    def checkRepo(self, files=None):
        """Check the state of the repository after ingest.

        This is an optional hook provided for subclasses; by default it does
        nothing.

        Parameters
        ----------
        files : `list` [`str`], or `None`
            List of files to be ingested, or `None` to use ``self.file``.
        """
        return

    @classmethod
    def _createRepo(cls):
        """Use the Click `testing` module to call the butler command-line API
        to create a repository.
        """
        runner = LogCliRunner()
        args = []
        if cls.seed_config:
            args.extend(["--seed-config", cls.seed_config])
        result = runner.invoke(butlerCli, ["create", cls.root, *args])
        # Classmethod so assertEqual does not work.
        assert result.exit_code == 0, f"output: {result.output} exception: {result.exception}"

    def _ingestRaws(self, transfer, file=None):
        """Use the Click `testing` module to call the butler command-line API
        to ingest raws.

        Parameters
        ----------
        transfer : `str`
            The external data transfer type.
        file : `str`, optional
            Path to a file to ingest instead of the default associated with
            the object.
        """
        if file is None:
            file = self.file
        runner = LogCliRunner()
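        # This invocation is equivalent to running the command-line tool
        # directly (repository path, file, and run name are placeholders):
        #
        #     butler ingest-raws <repo> <file> --output-run <run> \
        #         --transfer <transfer> --ingest-task lsst.obs.base.RawIngestTask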
        result = runner.invoke(
            butlerCli,
            [
                "ingest-raws",
                self.root,
                file,
                "--output-run",
                self.outputRun,
                "--transfer",
                transfer,
                "--ingest-task",
                self.rawIngestTask,
            ],
        )
        self.assertEqual(result.exit_code, 0, f"output: {result.output} exception: {result.exception}")

    @classmethod
    def _registerInstrument(cls):
        """Use the Click `testing` module to call the butler command-line API
        to register the instrument.
        """
        runner = LogCliRunner()
        result = runner.invoke(butlerCli, ["register-instrument", cls.root, cls.instrumentClassName])
        # Classmethod so assertEqual does not work.
        assert result.exit_code == 0, f"output: {result.output} exception: {result.exception}"

    def _writeCuratedCalibrations(self):
        """Use the Click `testing` module to call the butler command-line API
        to write curated calibrations.
        """
        runner = LogCliRunner()
        result = runner.invoke(butlerCli, ["write-curated-calibrations", self.root, self.instrumentName])
        self.assertEqual(result.exit_code, 0, f"output: {result.output} exception: {result.exception}")

    def testLink(self):
        self._ingestRaws(transfer="link")
        self.verifyIngest()

    def testSymLink(self):
        self._ingestRaws(transfer="symlink")
        self.verifyIngest()

    def testDirect(self):
        self._ingestRaws(transfer="direct")

        # Check that it really did have a URI outside of the datastore.
        srcUri = ResourcePath(self.file, forceAbsolute=True)
        butler = Butler(self.root, run=self.outputRun)
        datasets = list(butler.registry.queryDatasets(self.ingestDatasetTypeName, collections=self.outputRun))
        datastoreUri = butler.getURI(datasets[0])
        self.assertEqual(datastoreUri, srcUri)

    def testCopy(self):
        self._ingestRaws(transfer="copy")
        # Only test full read of raws for the copy test. No need to do it
        # in the other tests since the formatter will be the same in all
        # cases.
        self.verifyIngest(fullCheck=True)

    def testHardLink(self):
        try:
            self._ingestRaws(transfer="hardlink")
            # Running ingest through the Click testing infrastructure causes
            # the original exception indicating that we can't hard-link
            # on this filesystem to be turned into a nonzero exit code, which
            # then trips the test assertion.
        except (AssertionError, PermissionError) as err:
            raise unittest.SkipTest(
                "Skipping hard-link test because input data is on a different filesystem."
            ) from err
        self.verifyIngest()

    def testInPlace(self):
        """Test that files already in the directory can be added to the
        registry in-place.
        """
        butler = Butler(self.root, run=self.outputRun)

        # If the test uses an index file, the index file needs to also
        # appear in the datastore root along with the file to be ingested.
        # In that scenario the file name being used for ingest cannot
        # be modified and must have the same name as found in the index
        # file itself.
        source_file_uri = ResourcePath(self.file)
        index_file = source_file_uri.dirname().join("_index.json")
        pathInStore = source_file_uri.basename()
        if index_file.exists():
            os.symlink(index_file.ospath, self.datastore_root.join("_index.json").ospath)
        else:
            # No index file, so we are free to pick any name.
            pathInStore = "prefix-" + pathInStore

        # Create a symlink to the original file so that it looks like it
        # is now inside the datastore.
        newPath = self.datastore_root.join(pathInStore)
        os.symlink(os.path.abspath(self.file), newPath.ospath)

        # If there is a sidecar file it needs to be linked in as well
        # since the ingest code does not follow symlinks.
        sidecar_uri = ResourcePath(source_file_uri).updatedExtension(".json")
        if sidecar_uri.exists():
            newSidecar = ResourcePath(newPath).updatedExtension(".json")
            os.symlink(sidecar_uri.ospath, newSidecar.ospath)

        # Run ingest with auto mode since that should automatically determine
        # that an in-place ingest is happening.
        self._ingestRaws(transfer="auto", file=newPath.ospath)
        self.verifyIngest()

        # Recreate a butler post-ingest (the earlier one won't see the
        # ingested files).
        butler = Butler(self.root, run=self.outputRun)

        # Check that the URI associated with this path is the right one.
        uri = butler.getURI(self.ingestDatasetTypeName, self.dataIds[0])
        self.assertEqual(uri.relative_to(self.datastore_root), pathInStore)

    def testFailOnConflict(self):
        """Re-ingesting the same data into the repository should fail."""
        self._ingestRaws(transfer="symlink")
        with self.assertRaises(AssertionError):
            self._ingestRaws(transfer="symlink")

    def testWriteCuratedCalibrations(self):
        """Test that we can ingest the curated calibrations, and read them
        with `loadCamera` both before and after.
        """
        if self.curatedCalibrationDatasetTypes is None:
            raise unittest.SkipTest("Class requests disabling of writeCuratedCalibrations test")

        butler = Butler(self.root, writeable=False)
        collection = self.instrumentClass().makeCalibrationCollectionName()

        # Trying to load a camera with a data ID not known to the registry
        # is an error, because we can't get any temporal information.
        with self.assertRaises(LookupError):
            lsst.obs.base.loadCamera(butler, {"exposure": 0}, collections=collection)

        # Ingest raws in order to get some exposure records.
        self._ingestRaws(transfer="auto")

        # loadCamera should return an unversioned camera because there's
        # nothing in the repo.
        camera, isVersioned = lsst.obs.base.loadCamera(butler, self.dataIds[0], collections=collection)
        self.assertFalse(isVersioned)
        self.assertIsInstance(camera, lsst.afw.cameraGeom.Camera)

        self._writeCuratedCalibrations()

        # Make a new butler instance to make sure we don't have any stale
        # caches (e.g. of DatasetTypes). Note that we didn't give
        # _writeCuratedCalibrations the butler instance we had, because it's
        # trying to test the CLI interface anyway.
        butler = Butler(self.root, writeable=False)

        instrumentClass = self.instrumentClass()
        calibration_names = instrumentClass.getCuratedCalibrationNames()

        for datasetTypeName in self.curatedCalibrationDatasetTypes:
            with self.subTest(dtype=datasetTypeName):
                found = list(
                    butler.registry.queryDatasetAssociations(
                        datasetTypeName,
                        collections=collection,
                    )
                )
                self.assertGreater(len(found), 0, f"Checking {datasetTypeName}")
                self.assertIn(datasetTypeName, calibration_names)

        # loadCamera should return the versioned camera from the repo.
        camera, isVersioned = lsst.obs.base.loadCamera(butler, self.dataIds[0], collections=collection)
        self.assertTrue(isVersioned)
        self.assertIsInstance(camera, lsst.afw.cameraGeom.Camera)

    def testDefineVisits(self):
        if self.visits is None:
            self.skipTest("Expected visits were not defined.")
        self._ingestRaws(transfer="link")

        # Check that the obscore table (if configured) has the correct
        # contents.
        butler = Butler(self.root, run=self.outputRun)
        self._check_obscore(butler.registry, has_visits=False)

        # Calling defineVisits tests the implementation of the butler command
        # line interface "define-visits" subcommand. Functions in the script
        # folder are generally considered protected and should not be used
        # as public API.
        script.defineVisits(
            self.root,
            config_file=None,
            collections=self.outputRun,
            instrument=self.instrumentName,
            raw_name=self.ingestDatasetTypeName,
        )

        # Test that we got the visits we expected.
        visits = butler.registry.queryDataIds(["visit"]).expanded().toSet()
        self.assertCountEqual(visits, self.visits.keys())
        instr = Instrument.from_string(self.instrumentName, butler.registry)
        camera = instr.getCamera()
        for foundVisit, (expectedVisit, expectedExposures) in zip(visits, self.visits.items(), strict=True):
            # Test that this visit is associated with the expected exposures.
            foundExposures = (
                butler.registry.queryDataIds(["exposure"], dataId=expectedVisit).expanded().toSet()
            )
            self.assertCountEqual(foundExposures, expectedExposures)
            # Test that we have a visit region, and that it contains all of
            # the detector+visit regions.
            self.assertIsNotNone(foundVisit.region)
            detectorVisitDataIds = (
                butler.registry.queryDataIds(["visit", "detector"], dataId=expectedVisit).expanded().toSet()
            )
            self.assertEqual(len(detectorVisitDataIds), len(camera))
            for dataId in detectorVisitDataIds:
                self.assertTrue(foundVisit.region.contains(dataId.region))

        # Check the obscore table again.
        self._check_obscore(butler.registry, has_visits=True)

    def _check_obscore(self, registry: Registry, has_visits: bool) -> None:
        """Verify the contents of the obscore table.

        This is a hook for subclasses; by default it does nothing.
        """
        return