Coverage for tests/test_butler.py: 12%
1096 statements
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Tests for Butler.
23"""
25import gc
26import logging
27import os
28import pathlib
29import pickle
30import posixpath
31import random
32import shutil
33import string
34import tempfile
35import unittest
37try:
38 import boto3
39 import botocore
40 from moto import mock_s3
41except ImportError:
42 boto3 = None
44 def mock_s3(cls):
45 """A no-op decorator in case moto mock_s3 can not be imported."""
46 return cls
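# Added commentary (a sketch, not part of the original file): the moto
# decorator, or the no-op fallback above, lets S3-backed test cases be
# declared unconditionally, e.g. (class name is hypothetical):
#
#   @mock_s3
#   class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
#       ...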
49try:
50 # It's possible but silly to have testing.postgresql installed without
51 # having the postgresql server installed (because then nothing in
52 # testing.postgresql would work), so we use the presence of that module
53 # to test whether we can expect the server to be available.
54 import testing.postgresql
55except ImportError:
56 testing = None
58import astropy.time
59import sqlalchemy
60from lsst.daf.butler import (
61 Butler,
62 ButlerConfig,
63 CollectionType,
64 Config,
65 DatasetIdGenEnum,
66 DatasetRef,
67 DatasetType,
68 FileDataset,
69 FileTemplate,
70 FileTemplateValidationError,
71 StorageClassFactory,
72 ValidationError,
73 script,
74)
75from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
76from lsst.daf.butler.registry import (
77 CollectionError,
78 CollectionTypeError,
79 ConflictingDefinitionError,
80 DataIdValueError,
81 MissingCollectionError,
82 OrphanedRecordError,
83)
84from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
85from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
86from lsst.resources import ResourcePath
87from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
88from lsst.utils import doImport
89from lsst.utils.introspection import get_full_type_name
91TESTDIR = os.path.abspath(os.path.dirname(__file__))
94def makeExampleMetrics():
95 return MetricsExample(
96 {"AM1": 5.2, "AM2": 30.6},
97 {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
98 [563, 234, 456.7, 752, 8, 9, 27],
99 )
102class TransactionTestError(Exception):
103 """Specific error for testing transactions, to prevent misdiagnosing
104 that might otherwise occur when a standard exception is used.
105 """
107 pass
110class ButlerConfigTests(unittest.TestCase):
111 """Simple tests for ButlerConfig that are not tested in any other test
112 cases."""
114 def testSearchPath(self):
115 configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
116 with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
117 config1 = ButlerConfig(configFile)
118 self.assertNotIn("testConfigs", "\n".join(cm.output))
120 overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
121 with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
122 config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
123 self.assertIn("testConfigs", "\n".join(cm.output))
125 key = ("datastore", "records", "table")
126 self.assertNotEqual(config1[key], config2[key])
127 self.assertEqual(config2[key], "override_record")
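# Added commentary (not in the original test): searchPaths entries are extra
# configuration directories consulted ahead of the defaults, so a
# repository-specific value such as ("datastore", "records", "table") can be
# overridden, which is what the assertions above check.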
130class ButlerPutGetTests:
131 """Helper method for running a suite of put/get tests from different
132 butler configurations."""
134 root = None
135 default_run = "ingésτ😺"
137 @staticmethod
138 def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
139 """Create a DatasetType and register it"""
140 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
141 registry.registerDatasetType(datasetType)
142 return datasetType
144 @classmethod
145 def setUpClass(cls):
146 cls.storageClassFactory = StorageClassFactory()
147 cls.storageClassFactory.addFromConfig(cls.configFile)
149 def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
150 datasetType = datasetRef.datasetType
151 dataId = datasetRef.dataId
152 deferred = butler.getDeferred(datasetRef)
154 for component in components:
155 compTypeName = datasetType.componentTypeName(component)
156 result = butler.get(compTypeName, dataId, collections=collections)
157 self.assertEqual(result, getattr(reference, component))
158 result_deferred = deferred.get(component=component)
159 self.assertEqual(result_deferred, result)
161 def tearDown(self):
162 removeTestTempDir(self.root)
164 def create_butler(self, run, storageClass, datasetTypeName):
165 butler = Butler(self.tmpConfigFile, run=run)
167 collections = set(butler.registry.queryCollections())
168 self.assertEqual(collections, set([run]))
170 # Create and register a DatasetType
171 dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
173 datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
175 # Add needed Dimensions
176 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
177 butler.registry.insertDimensionData(
178 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
179 )
180 butler.registry.insertDimensionData(
181 "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
182 )
183 visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
184 visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
185 butler.registry.insertDimensionData(
186 "visit",
187 {
188 "instrument": "DummyCamComp",
189 "id": 423,
190 "name": "fourtwentythree",
191 "physical_filter": "d-r",
192 "visit_system": 1,
193 "datetime_begin": visit_start,
194 "datetime_end": visit_end,
195 },
196 )
198 # Add more visits for some later tests
199 for visit_id in (424, 425):
200 butler.registry.insertDimensionData(
201 "visit",
202 {
203 "instrument": "DummyCamComp",
204 "id": visit_id,
205 "name": f"fourtwentyfour_{visit_id}",
206 "physical_filter": "d-r",
207 "visit_system": 1,
208 },
209 )
210 return butler, datasetType
212 def runPutGetTest(self, storageClass, datasetTypeName):
213 # New datasets will be added to a run collection, and that run is
214 # what we search when looking the datasets back up.
215 run = self.default_run
216 butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)
218 # Create and store a dataset
219 metric = makeExampleMetrics()
220 dataId = {"instrument": "DummyCamComp", "visit": 423}
222 # Create a DatasetRef for put
223 refIn = DatasetRef(datasetType, dataId, id=None)
225 # Put with a preexisting id should fail
226 with self.assertRaises(ValueError):
227 butler.put(metric, DatasetRef(datasetType, dataId, id=100))
229 # Put and remove the dataset once as a DatasetRef, once as a dataId,
230 # and once with a DatasetType
232 # Keep track of any collections we add and do not clean up
233 expected_collections = {run}
235 counter = 0
236 for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
237 # Since we are using subTest we can get cascading failures
238 # here, with the first attempt failing and the others failing
239 # immediately because the dataset already exists. Work around
240 # this by using a distinct run collection each time.
241 counter += 1
242 this_run = f"put_run_{counter}"
243 butler.registry.registerCollection(this_run, type=CollectionType.RUN)
244 expected_collections.update({this_run})
246 with self.subTest(args=args):
247 ref = butler.put(metric, *args, run=this_run)
248 self.assertIsInstance(ref, DatasetRef)
250 # Test get using the resolved DatasetRef directly
251 metricOut = butler.get(ref)
252 self.assertEqual(metric, metricOut)
253 # Test get
254 metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
255 self.assertEqual(metric, metricOut)
256 # Test get with a datasetRef
257 metricOut = butler.get(ref, collections=this_run)
258 self.assertEqual(metric, metricOut)
259 # Test getDeferred with dataId
260 metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
261 self.assertEqual(metric, metricOut)
262 # Test getDeferred with a datasetRef
263 metricOut = butler.getDeferred(ref, collections=this_run).get()
264 self.assertEqual(metric, metricOut)
265 # and deferred direct with ref
266 metricOut = butler.getDeferred(ref).get()
267 self.assertEqual(metric, metricOut)
269 # Check we can get components
270 if storageClass.isComposite():
271 self.assertGetComponents(
272 butler, ref, ("summary", "data", "output"), metric, collections=this_run
273 )
275 # Can the artifacts themselves be retrieved?
276 if not butler.datastore.isEphemeral:
277 root_uri = ResourcePath(self.root)
279 for preserve_path in (True, False):
280 destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
281 # Use copy so that we can test that overwrite
282 # protection works (using "auto" for File URIs would
283 # use hard links and subsequent transfer would work
284 # because it knows they are the same file).
285 transferred = butler.retrieveArtifacts(
286 [ref], destination, preserve_path=preserve_path, transfer="copy"
287 )
288 self.assertGreater(len(transferred), 0)
289 artifacts = list(ResourcePath.findFileResources([destination]))
290 self.assertEqual(set(transferred), set(artifacts))
292 for artifact in transferred:
293 path_in_destination = artifact.relative_to(destination)
294 self.assertIsNotNone(path_in_destination)
296 # When the path is not preserved there should not be
297 # any path separators.
298 num_seps = path_in_destination.count("/")
299 if preserve_path:
300 self.assertGreater(num_seps, 0)
301 else:
302 self.assertEqual(num_seps, 0)
304 primary_uri, secondary_uris = butler.datastore.getURIs(ref)
305 n_uris = len(secondary_uris)
306 if primary_uri:
307 n_uris += 1
308 self.assertEqual(
309 len(artifacts),
310 n_uris,
311 "Comparing expected artifacts vs actual:"
312 f" {artifacts} vs {primary_uri} and {secondary_uris}",
313 )
315 if preserve_path:
316 # No need to run these twice
317 with self.assertRaises(ValueError):
318 butler.retrieveArtifacts([ref], destination, transfer="move")
320 with self.assertRaises(FileExistsError):
321 butler.retrieveArtifacts([ref], destination)
323 transferred_again = butler.retrieveArtifacts(
324 [ref], destination, preserve_path=preserve_path, overwrite=True
325 )
326 self.assertEqual(set(transferred_again), set(transferred))
328 # Now remove the dataset completely.
329 butler.pruneDatasets([ref], purge=True, unstore=True)
330 # Lookup with original args should still fail.
331 with self.assertRaises(LookupError):
332 butler.datasetExists(*args, collections=this_run)
333 # get() should still fail.
334 with self.assertRaises(FileNotFoundError):
335 butler.get(ref)
336 # Registry shouldn't be able to find it by dataset_id anymore.
337 self.assertIsNone(butler.registry.getDataset(ref.id))
339 # Do explicit registry removal since we know the run collection
340 # is now empty.
341 butler.registry.removeCollection(this_run)
342 expected_collections.remove(this_run)
344 # Put the dataset again, since the last thing we did was remove it
345 # and we want to use the default collection.
346 ref = butler.put(metric, refIn)
348 # Get with parameters
349 stop = 4
350 sliced = butler.get(ref, parameters={"slice": slice(stop)})
351 self.assertNotEqual(metric, sliced)
352 self.assertEqual(metric.summary, sliced.summary)
353 self.assertEqual(metric.output, sliced.output)
354 self.assertEqual(metric.data[:stop], sliced.data)
355 # getDeferred with parameters
356 sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
357 self.assertNotEqual(metric, sliced)
358 self.assertEqual(metric.summary, sliced.summary)
359 self.assertEqual(metric.output, sliced.output)
360 self.assertEqual(metric.data[:stop], sliced.data)
361 # getDeferred with deferred parameters
362 sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
363 self.assertNotEqual(metric, sliced)
364 self.assertEqual(metric.summary, sliced.summary)
365 self.assertEqual(metric.output, sliced.output)
366 self.assertEqual(metric.data[:stop], sliced.data)
368 if storageClass.isComposite():
369 # Check that components can be retrieved
370 metricOut = butler.get(ref.datasetType.name, dataId)
371 compNameS = ref.datasetType.componentTypeName("summary")
372 compNameD = ref.datasetType.componentTypeName("data")
373 summary = butler.get(compNameS, dataId)
374 self.assertEqual(summary, metric.summary)
375 data = butler.get(compNameD, dataId)
376 self.assertEqual(data, metric.data)
378 if "counter" in storageClass.derivedComponents:
379 count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
380 self.assertEqual(count, len(data))
382 count = butler.get(
383 ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
384 )
385 self.assertEqual(count, stop)
387 compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
388 summary = butler.get(compRef)
389 self.assertEqual(summary, metric.summary)
391 # Create a DatasetType that has the same name but is inconsistent.
392 inconsistentDatasetType = DatasetType(
393 datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
394 )
396 # Getting with a dataset type that does not match registry fails
397 with self.assertRaises(ValueError):
398 butler.get(inconsistentDatasetType, dataId)
400 # Combining a DatasetRef with a dataId should fail
401 with self.assertRaises(ValueError):
402 butler.get(ref, dataId)
403 # Getting with an explicit ref should fail if the id doesn't match
404 with self.assertRaises(ValueError):
405 butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))
407 # Getting a dataset with unknown parameters should fail
408 with self.assertRaises(KeyError):
409 butler.get(ref, parameters={"unsupported": True})
411 # Check we have a collection
412 collections = set(butler.registry.queryCollections())
413 self.assertEqual(collections, expected_collections)
415 # Clean up to check that we can remove something that may have
416 # already had a component removed
417 butler.pruneDatasets([ref], unstore=True, purge=True)
419 # Check that we can configure a butler to accept a put even
420 # if it already has the dataset in registry.
421 ref = butler.put(metric, refIn)
423 # Repeat put will fail.
424 with self.assertRaises(ConflictingDefinitionError):
425 butler.put(metric, refIn)
427 # Remove the datastore entry.
428 butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)
430 # Put will still fail
431 with self.assertRaises(ConflictingDefinitionError):
432 butler.put(metric, refIn)
434 # Allow the put to succeed
435 butler._allow_put_of_predefined_dataset = True
436 ref2 = butler.put(metric, refIn)
437 self.assertEqual(ref2.id, ref.id)
439 # A second put will still fail but with a different exception
440 # than before.
441 with self.assertRaises(ConflictingDefinitionError):
442 butler.put(metric, refIn)
444 # Reset the flag to avoid confusion
445 butler._allow_put_of_predefined_dataset = False
447 # Leave the dataset in place since some downstream tests require
448 # something to be present
450 return butler
452 def testDeferredCollectionPassing(self):
453 # Construct a butler with no run or collection, but make it writeable.
454 butler = Butler(self.tmpConfigFile, writeable=True)
455 # Create and register a DatasetType
456 dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
457 datasetType = self.addDatasetType(
458 "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
459 )
460 # Add needed Dimensions
461 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
462 butler.registry.insertDimensionData(
463 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
464 )
465 butler.registry.insertDimensionData(
466 "visit",
467 {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
468 )
469 dataId = {"instrument": "DummyCamComp", "visit": 423}
470 # Create dataset.
471 metric = makeExampleMetrics()
472 # Register a new run and put dataset.
473 run = "deferred"
474 self.assertTrue(butler.registry.registerRun(run))
475 # A second registration is allowed but indicates a no-op.
476 self.assertFalse(butler.registry.registerRun(run))
477 ref = butler.put(metric, datasetType, dataId, run=run)
478 # Putting with no run should fail with CollectionError.
479 with self.assertRaises(CollectionError):
480 butler.put(metric, datasetType, dataId)
481 # Dataset should exist.
482 self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
483 # We should be able to get the dataset back, but with and without
484 # a deferred dataset handle.
485 self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
486 self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
487 # Trying to find the dataset without any collection raises CollectionError.
488 with self.assertRaises(CollectionError):
489 butler.datasetExists(datasetType, dataId)
490 with self.assertRaises(CollectionError):
491 butler.get(datasetType, dataId)
492 # Associate the dataset with a different collection.
493 butler.registry.registerCollection("tagged")
494 butler.registry.associate("tagged", [ref])
495 # Removing the dataset from the new tagged collection should leave it
496 # findable in the original run collection.
497 butler.pruneDatasets([ref], tags=["tagged"])
498 self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
501class ButlerTests(ButlerPutGetTests):
502 """Tests for Butler."""
504 useTempRoot = True
506 def setUp(self):
507 """Create a new butler root for each test."""
508 self.root = makeTestTempDir(TESTDIR)
509 Butler.makeRepo(self.root, config=Config(self.configFile))
510 self.tmpConfigFile = os.path.join(self.root, "butler.yaml")
512 def testConstructor(self):
513 """Independent test of constructor."""
514 butler = Butler(self.tmpConfigFile, run=self.default_run)
515 self.assertIsInstance(butler, Butler)
517 # Check that butler.yaml is added automatically.
518 if self.tmpConfigFile.endswith(end := "/butler.yaml"):
519 config_dir = self.tmpConfigFile[: -len(end)]
520 butler = Butler(config_dir, run=self.default_run)
521 self.assertIsInstance(butler, Butler)
523 # Even with a ResourcePath.
524 butler = Butler(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
525 self.assertIsInstance(butler, Butler)
527 collections = set(butler.registry.queryCollections())
528 self.assertEqual(collections, {self.default_run})
530 # Check that some special characters can be included in run name.
531 special_run = "u@b.c-A"
532 butler_special = Butler(butler=butler, run=special_run)
533 collections = set(butler_special.registry.queryCollections("*@*"))
534 self.assertEqual(collections, {special_run})
536 butler2 = Butler(butler=butler, collections=["other"])
537 self.assertEqual(butler2.collections, ("other",))
538 self.assertIsNone(butler2.run)
539 self.assertIs(butler.datastore, butler2.datastore)
541 # Test that we can use an environment variable to find this
542 # repository.
543 butler_index = Config()
544 butler_index["label"] = self.tmpConfigFile
545 for suffix in (".yaml", ".json"):
546 # Ensure that the content differs so that we know that
547 # we aren't reusing the cache.
548 bad_label = f"s3://bucket/not_real{suffix}"
549 butler_index["bad_label"] = bad_label
550 with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
551 butler_index.dumpToUri(temp_file)
552 with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
553 self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
554 uri = Butler.get_repo_uri("bad_label")
555 self.assertEqual(uri, ResourcePath(bad_label))
556 uri = Butler.get_repo_uri("label")
557 butler = Butler(uri, writeable=False)
558 self.assertIsInstance(butler, Butler)
559 butler = Butler("label", writeable=False)
560 self.assertIsInstance(butler, Butler)
561 with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
562 Butler("not_there", writeable=False)
563 with self.assertRaises(KeyError) as cm:
564 Butler.get_repo_uri("missing")
565 self.assertIn("not known to", str(cm.exception))
566 with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
567 with self.assertRaises(FileNotFoundError):
568 Butler.get_repo_uri("label")
569 self.assertEqual(Butler.get_known_repos(), set())
570 with self.assertRaises(KeyError) as cm:
571 # No environment variable set.
572 Butler.get_repo_uri("label")
573 self.assertIn("No repository index defined", str(cm.exception))
574 with self.assertRaisesRegex(FileNotFoundError, "no known aliases"):
575 # No aliases registered.
576 Butler("not_there")
577 self.assertEqual(Butler.get_known_repos(), set())
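# Added commentary (a sketch under assumptions, not part of the original
# test): the repository index pointed at by DAF_BUTLER_REPOSITORY_INDEX is a
# small Config mapping labels to repo URIs, e.g. (label and paths are
# hypothetical):
#
#   index = Config()
#   index["main"] = "/repo/main/butler.yaml"
#   index.dumpToUri("/path/to/repo_index.yaml")
#   # export DAF_BUTLER_REPOSITORY_INDEX=/path/to/repo_index.yaml
#   butler = Butler("main", writeable=False)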
579 def testBasicPutGet(self):
580 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
581 self.runPutGetTest(storageClass, "test_metric")
583 def testCompositePutGetConcrete(self):
584 storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
585 butler = self.runPutGetTest(storageClass, "test_metric")
587 # Should *not* be disassembled
588 datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
589 self.assertEqual(len(datasets), 1)
590 uri, components = butler.getURIs(datasets[0])
591 self.assertIsInstance(uri, ResourcePath)
592 self.assertFalse(components)
593 self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
594 self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
596 # Predicted dataset
597 dataId = {"instrument": "DummyCamComp", "visit": 424}
598 uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
599 self.assertFalse(components)
600 self.assertIsInstance(uri, ResourcePath)
601 self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
602 self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
604 def testCompositePutGetVirtual(self):
605 storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
606 butler = self.runPutGetTest(storageClass, "test_metric_comp")
608 # Should be disassembled
609 datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
610 self.assertEqual(len(datasets), 1)
611 uri, components = butler.getURIs(datasets[0])
613 if butler.datastore.isEphemeral:
614 # Never disassemble in-memory datastore
615 self.assertIsInstance(uri, ResourcePath)
616 self.assertFalse(components)
617 self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
618 self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
619 else:
620 self.assertIsNone(uri)
621 self.assertEqual(set(components), set(storageClass.components))
622 for compuri in components.values():
623 self.assertIsInstance(compuri, ResourcePath)
624 self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
625 self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")
627 # Predicted dataset
628 dataId = {"instrument": "DummyCamComp", "visit": 424}
629 uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
631 if butler.datastore.isEphemeral:
632 # Never disassembled
633 self.assertIsInstance(uri, ResourcePath)
634 self.assertFalse(components)
635 self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
636 self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
637 else:
638 self.assertIsNone(uri)
639 self.assertEqual(set(components), set(storageClass.components))
640 for compuri in components.values():
641 self.assertIsInstance(compuri, ResourcePath)
642 self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
643 self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")
645 def testStorageClassOverrideGet(self):
646 """Test storage class conversion on get with override."""
647 storageClass = self.storageClassFactory.getStorageClass("StructuredData")
648 datasetTypeName = "anything"
649 run = self.default_run
651 butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)
653 # Create and store a dataset.
654 metric = makeExampleMetrics()
655 dataId = {"instrument": "DummyCamComp", "visit": 423}
657 ref = butler.put(metric, datasetType, dataId)
659 # Return native type.
660 retrieved = butler.get(ref)
661 self.assertEqual(retrieved, metric)
663 # Specify an override.
664 new_sc = self.storageClassFactory.getStorageClass("MetricsConversion")
665 model = butler.get(ref, storageClass=new_sc)
666 self.assertNotEqual(type(model), type(retrieved))
667 self.assertIs(type(model), new_sc.pytype)
668 self.assertEqual(retrieved, model)
670 # Defer but override later.
671 deferred = butler.getDeferred(ref)
672 model = deferred.get(storageClass=new_sc)
673 self.assertIs(type(model), new_sc.pytype)
674 self.assertEqual(retrieved, model)
676 # Defer but override up front.
677 deferred = butler.getDeferred(ref, storageClass=new_sc)
678 model = deferred.get()
679 self.assertIs(type(model), new_sc.pytype)
680 self.assertEqual(retrieved, model)
682 # Retrieve a component. Should be a tuple.
683 data = butler.get("anything.data", dataId, storageClass="StructuredDataDataTestTuple")
684 self.assertIs(type(data), tuple)
685 self.assertEqual(data, tuple(retrieved.data))
687 # Parameter on the write storage class should work regardless
688 # of read storage class.
689 data = butler.get(
690 "anything.data",
691 dataId,
692 storageClass="StructuredDataDataTestTuple",
693 parameters={"slice": slice(2, 4)},
694 )
695 self.assertEqual(len(data), 2)
697 # Try a parameter that is known to the read storage class but not
698 # the write storage class.
699 with self.assertRaises(KeyError):
700 butler.get(
701 "anything.data",
702 dataId,
703 storageClass="StructuredDataDataTestTuple",
704 parameters={"xslice": slice(2, 4)},
705 )
707 def testPytypePutCoercion(self):
708 """Test python type coercion on Butler.get and put."""
710 # Store some data with the normal example storage class.
711 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
712 datasetTypeName = "test_metric"
713 butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)
715 dataId = {"instrument": "DummyCamComp", "visit": 423}
717 # Put a dict and this should coerce to a MetricsExample
718 test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
719 metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
720 test_metric = butler.get(metric_ref)
721 self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
722 self.assertEqual(test_metric.summary, test_dict["summary"])
723 self.assertEqual(test_metric.output, test_dict["output"])
725 # Check that the put still works if a DatasetType is given with
726 # a definition matching this python type.
727 registry_type = butler.registry.getDatasetType(datasetTypeName)
728 this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
729 metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
730 self.assertEqual(metric2_ref.datasetType, registry_type)
732 # The get will return the type expected by registry.
733 test_metric2 = butler.get(metric2_ref)
734 self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")
736 # Make a new DatasetRef with the compatible but different DatasetType.
737 # This should now return a dict.
738 new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
739 test_dict2 = butler.get(new_ref)
740 self.assertEqual(get_full_type_name(test_dict2), "dict")
742 # Get it again with the compatible but different dataset type, this
743 # time by dataset type and data ID rather than by DatasetRef. This
744 # should be consistent and return the python type of that DatasetType.
745 test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
746 self.assertEqual(get_full_type_name(test_dict3), "dict")
748 def testIngest(self):
749 butler = Butler(self.tmpConfigFile, run=self.default_run)
751 # Create and register a DatasetType
752 dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])
754 storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
755 datasetTypeName = "metric"
757 datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
759 # Add needed Dimensions
760 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
761 butler.registry.insertDimensionData(
762 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
763 )
764 for detector in (1, 2):
765 butler.registry.insertDimensionData(
766 "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
767 )
769 butler.registry.insertDimensionData(
770 "visit",
771 {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
772 {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
773 )
775 formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
776 dataRoot = os.path.join(TESTDIR, "data", "basic")
777 datasets = []
778 for detector in (1, 2):
779 detector_name = f"detector_{detector}"
780 metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
781 dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
782 # Create a DatasetRef for ingest
783 refIn = DatasetRef(datasetType, dataId, id=None)
785 datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))
787 butler.ingest(*datasets, transfer="copy")
789 dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
790 dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}
792 metrics1 = butler.get(datasetTypeName, dataId1)
793 metrics2 = butler.get(datasetTypeName, dataId2)
794 self.assertNotEqual(metrics1, metrics2)
796 # Compare URIs
797 uri1 = butler.getURI(datasetTypeName, dataId1)
798 uri2 = butler.getURI(datasetTypeName, dataId2)
799 self.assertNotEqual(uri1, uri2)
801 # Now do a multi-dataset but single file ingest
802 metricFile = os.path.join(dataRoot, "detectors.yaml")
803 refs = []
804 for detector in (1, 2):
805 detector_name = f"detector_{detector}"
806 dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
807 # Create a DatasetRef for ingest
808 refs.append(DatasetRef(datasetType, dataId, id=None))
810 # Test "move" transfer to ensure that the files themselves
811 # have disappeared following ingest.
812 with ResourcePath.temporary_uri(suffix=".yaml") as tempFile:
813 tempFile.transfer_from(ResourcePath(metricFile), transfer="copy")
815 datasets = []
816 datasets.append(FileDataset(path=tempFile, refs=refs, formatter=MultiDetectorFormatter))
818 butler.ingest(*datasets, transfer="move", record_validation_info=False)
819 self.assertFalse(tempFile.exists())
821 # Check that the datastore recorded no file size.
822 # Not all datastores can support this.
823 try:
824 infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])
825 self.assertEqual(infos[0].file_size, -1)
826 except AttributeError:
827 pass
829 dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
830 dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}
832 multi1 = butler.get(datasetTypeName, dataId1)
833 multi2 = butler.get(datasetTypeName, dataId2)
835 self.assertEqual(multi1, metrics1)
836 self.assertEqual(multi2, metrics2)
838 # Compare URIs
839 uri1 = butler.getURI(datasetTypeName, dataId1)
840 uri2 = butler.getURI(datasetTypeName, dataId2)
841 self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")
843 # Test that removing one does not break the second.
844 # This line will issue a warning log message for a ChainedDatastore
845 # that uses an InMemoryDatastore, since an in-memory datastore cannot
846 # ingest files.
847 butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
848 self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
849 self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
850 multi2b = butler.get(datasetTypeName, dataId2)
851 self.assertEqual(multi2, multi2b)
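# Added commentary (not in the original test): a single file can be ingested
# for several data IDs by giving one FileDataset multiple refs, as the "move"
# ingest above does:
#
#   FileDataset(path=tempFile, refs=[ref_det1, ref_det2],
#               formatter=MultiDetectorFormatter)
#
# (ref_det1/ref_det2 are illustrative names for the per-detector refs.)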
853 def testPickle(self):
854 """Test pickle support."""
855 butler = Butler(self.tmpConfigFile, run=self.default_run)
856 butlerOut = pickle.loads(pickle.dumps(butler))
857 self.assertIsInstance(butlerOut, Butler)
858 self.assertEqual(butlerOut._config, butler._config)
859 self.assertEqual(butlerOut.collections, butler.collections)
860 self.assertEqual(butlerOut.run, butler.run)
862 def testGetDatasetTypes(self):
863 butler = Butler(self.tmpConfigFile, run=self.default_run)
864 dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
865 dimensionEntries = [
866 (
867 "instrument",
868 {"instrument": "DummyCam"},
869 {"instrument": "DummyHSC"},
870 {"instrument": "DummyCamComp"},
871 ),
872 ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
873 ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
874 ]
875 storageClass = self.storageClassFactory.getStorageClass("StructuredData")
876 # Add needed Dimensions
877 for args in dimensionEntries:
878 butler.registry.insertDimensionData(*args)
880 # When a DatasetType is added to the registry, entries are not created
881 # for its components, but querying can still return the components.
882 datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
883 components = set()
884 for datasetTypeName in datasetTypeNames:
885 # Create and register a DatasetType
886 self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
888 for componentName in storageClass.components:
889 components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))
891 fromRegistry: set[DatasetType] = set()
892 for parent_dataset_type in butler.registry.queryDatasetTypes():
893 fromRegistry.add(parent_dataset_type)
894 fromRegistry.update(parent_dataset_type.makeAllComponentDatasetTypes())
895 self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)
897 # Now that we have some dataset types registered, validate them
898 butler.validateConfiguration(
899 ignore=[
900 "test_metric_comp",
901 "metric3",
902 "metric5",
903 "calexp",
904 "DummySC",
905 "datasetType.component",
906 "random_data",
907 "random_data_2",
908 ]
909 )
911 # Add a new datasetType that will fail template validation
912 self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
913 if self.validationCanFail:
914 with self.assertRaises(ValidationError):
915 butler.validateConfiguration()
917 # Rerun validation but with a subset of dataset type names
918 butler.validateConfiguration(datasetTypeNames=["metric4"])
920 # Rerun validation but ignore the bad datasetType
921 butler.validateConfiguration(
922 ignore=[
923 "test_metric_comp",
924 "metric3",
925 "metric5",
926 "calexp",
927 "DummySC",
928 "datasetType.component",
929 "random_data",
930 "random_data_2",
931 ]
932 )
934 def testTransaction(self):
935 butler = Butler(self.tmpConfigFile, run=self.default_run)
936 datasetTypeName = "test_metric"
937 dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
938 dimensionEntries = (
939 ("instrument", {"instrument": "DummyCam"}),
940 ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
941 ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
942 )
943 storageClass = self.storageClassFactory.getStorageClass("StructuredData")
944 metric = makeExampleMetrics()
945 dataId = {"instrument": "DummyCam", "visit": 42}
946 # Create and register a DatasetType
947 datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
948 with self.assertRaises(TransactionTestError):
949 with butler.transaction():
950 # Add needed Dimensions
951 for args in dimensionEntries:
952 butler.registry.insertDimensionData(*args)
953 # Store a dataset
954 ref = butler.put(metric, datasetTypeName, dataId)
955 self.assertIsInstance(ref, DatasetRef)
956 # Test get using the resolved DatasetRef directly
957 metricOut = butler.get(ref)
958 self.assertEqual(metric, metricOut)
959 # Test get
960 metricOut = butler.get(datasetTypeName, dataId)
961 self.assertEqual(metric, metricOut)
962 # Check we can get components
963 self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
964 raise TransactionTestError("This should roll back the entire transaction")
965 with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
966 butler.registry.expandDataId(dataId)
967 # Should raise LookupError for missing data ID value
968 with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
969 butler.get(datasetTypeName, dataId)
970 # Also check explicitly if Dataset entry is missing
971 self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
972 # Direct retrieval should not find the file in the Datastore
973 with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
974 butler.get(ref)
976 def testMakeRepo(self):
977 """Test that we can write butler configuration to a new repository via
978 the Butler.makeRepo interface and then instantiate a butler from the
979 repo root.
980 """
981 # Do not run the test if we know this datastore configuration does
982 # not support a file system root
983 if self.fullConfigKey is None:
984 return
986 # create two separate directories
987 root1 = tempfile.mkdtemp(dir=self.root)
988 root2 = tempfile.mkdtemp(dir=self.root)
990 butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
991 limited = Config(self.configFile)
992 butler1 = Butler(butlerConfig)
993 butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
994 full = Config(self.tmpConfigFile)
995 butler2 = Butler(butlerConfig)
996 # Butlers should have the same configuration regardless of whether
997 # defaults were expanded.
998 self.assertEqual(butler1._config, butler2._config)
999 # Config files loaded directly should not be the same.
1000 self.assertNotEqual(limited, full)
1001 # Make sure "limited" doesn't have a few keys we know it should be
1002 # inheriting from defaults.
1003 self.assertIn(self.fullConfigKey, full)
1004 self.assertNotIn(self.fullConfigKey, limited)
1006 # Collections don't appear until something is put in them
1007 collections1 = set(butler1.registry.queryCollections())
1008 self.assertEqual(collections1, set())
1009 self.assertEqual(set(butler2.registry.queryCollections()), collections1)
1011 # Check that a config with no associated file name will not
1012 # work properly with relocatable Butler repo
1013 butlerConfig.configFile = None
1014 with self.assertRaises(ValueError):
1015 Butler(butlerConfig)
1017 with self.assertRaises(FileExistsError):
1018 Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)
1020 def testStringification(self):
1021 butler = Butler(self.tmpConfigFile, run=self.default_run)
1022 butlerStr = str(butler)
1024 if self.datastoreStr is not None:
1025 for testStr in self.datastoreStr:
1026 self.assertIn(testStr, butlerStr)
1027 if self.registryStr is not None:
1028 self.assertIn(self.registryStr, butlerStr)
1030 datastoreName = butler.datastore.name
1031 if self.datastoreName is not None:
1032 for testStr in self.datastoreName:
1033 self.assertIn(testStr, datastoreName)
1035 def testButlerRewriteDataId(self):
1036 """Test that dataIds can be rewritten based on dimension records."""
1038 butler = Butler(self.tmpConfigFile, run=self.default_run)
1040 storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
1041 datasetTypeName = "random_data"
1043 # Create dimension records.
1044 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1045 butler.registry.insertDimensionData(
1046 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
1047 )
1048 butler.registry.insertDimensionData(
1049 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
1050 )
1052 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
1053 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
1054 butler.registry.registerDatasetType(datasetType)
1056 n_exposures = 5
1057 dayobs = 20210530
1059 for i in range(n_exposures):
1060 butler.registry.insertDimensionData(
1061 "exposure",
1062 {
1063 "instrument": "DummyCamComp",
1064 "id": i,
1065 "obs_id": f"exp{i}",
1066 "seq_num": i,
1067 "day_obs": dayobs,
1068 "physical_filter": "d-r",
1069 },
1070 )
1072 # Write some data.
1073 for i in range(n_exposures):
1074 metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}
1076 # Use the seq_num for the put to test rewriting.
1077 dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
1078 ref = butler.put(metric, datasetTypeName, dataId=dataId)
1080 # Check that the exposure is correct in the dataId
1081 self.assertEqual(ref.dataId["exposure"], i)
1083 # and check that we can get the dataset back with the same dataId
1084 new_metric = butler.get(datasetTypeName, dataId=dataId)
1085 self.assertEqual(new_metric, metric)
1088class FileDatastoreButlerTests(ButlerTests):
1089 """Common tests and specialization of ButlerTests for butlers backed
1090 by datastores that inherit from FileDatastore.
1091 """
1093 def checkFileExists(self, root, relpath):
1094 """Checks if file exists at a given path (relative to root).
1096 Test testPutTemplates verifies actual physical existance of the files
1097 in the requested location.
1098 """
1099 uri = ResourcePath(root, forceDirectory=True)
1100 return uri.join(relpath).exists()
1102 def testPutTemplates(self):
1103 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1104 butler = Butler(self.tmpConfigFile, run=self.default_run)
1106 # Add needed Dimensions
1107 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1108 butler.registry.insertDimensionData(
1109 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
1110 )
1111 butler.registry.insertDimensionData(
1112 "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
1113 )
1114 butler.registry.insertDimensionData(
1115 "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
1116 )
1118 # Create and store a dataset
1119 metric = makeExampleMetrics()
1121 # Create two almost-identical DatasetTypes (both use the default
1122 # template), plus metric3 for the non-unique-template check below.
1123 dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
1124 butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
1125 butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
1126 butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))
1128 dataId1 = {"instrument": "DummyCamComp", "visit": 423}
1129 dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}
1131 # Put with exactly the data ID keys needed
1132 ref = butler.put(metric, "metric1", dataId1)
1133 uri = butler.getURI(ref)
1134 self.assertTrue(uri.exists())
1135 self.assertTrue(
1136 uri.unquoted_path.endswith(f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle")
1137 )
1139 # Check the template based on dimensions
1140 if hasattr(butler.datastore, "templates"):
1141 butler.datastore.templates.validateTemplates([ref])
1143 # Put with extra data ID keys (physical_filter is an optional
1144 # dependency); should not change template (at least the way we're
1145 # defining them to behave now; the important thing is that they
1146 # must be consistent).
1147 ref = butler.put(metric, "metric2", dataId2)
1148 uri = butler.getURI(ref)
1149 self.assertTrue(uri.exists())
1150 self.assertTrue(
1151 uri.unquoted_path.endswith(f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle")
1152 )
1154 # Check the template based on dimensions
1155 if hasattr(butler.datastore, "templates"):
1156 butler.datastore.templates.validateTemplates([ref])
1158 # Use a template that has a typo in dimension record metadata.
1159 # Easier to test with a butler that has a ref with records attached.
1160 template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits")
1161 with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"):
1162 path = template.format(ref)
1163 self.assertEqual(path, f"a/v423/{ref.id}_fits")
1165 template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits")
1166 with self.assertRaises(KeyError):
1167 with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"):
1168 template.format(ref)
1170 # Now use a file template that will not result in unique filenames
1171 with self.assertRaises(FileTemplateValidationError):
1172 butler.put(metric, "metric3", dataId1)
1174 def testImportExport(self):
1175 # Run put/get tests just to create and populate a repo.
1176 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1177 self.runImportExportTest(storageClass)
1179 @unittest.expectedFailure
1180 def testImportExportVirtualComposite(self):
1181 # Run put/get tests just to create and populate a repo.
1182 storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
1183 self.runImportExportTest(storageClass)
1185 def runImportExportTest(self, storageClass):
1186 """This test does an export to a temp directory and an import back
1187 into a new temp directory repo. It does not assume a posix datastore"""
1188 exportButler = self.runPutGetTest(storageClass, "test_metric")
1190 # Test that we must have a file extension.
1191 with self.assertRaises(ValueError):
1192 with exportButler.export(filename="dump", directory=".") as export:
1193 pass
1195 # Test that unknown format is not allowed.
1196 with self.assertRaises(ValueError):
1197 with exportButler.export(filename="dump.fits", directory=".") as export:
1198 pass
1200 # Test that the repo actually has at least one dataset.
1201 datasets = list(exportButler.registry.queryDatasets(..., collections=...))
1202 self.assertGreater(len(datasets), 0)
1203 # Add a DimensionRecord that's unused by those datasets.
1204 skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
1205 exportButler.registry.insertDimensionData("skymap", skymapRecord)
1206 # Export and then import datasets.
1207 with safeTestTempDir(TESTDIR) as exportDir:
1208 exportFile = os.path.join(exportDir, "exports.yaml")
1209 with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
1210 export.saveDatasets(datasets)
1211 # Export the same datasets again. This should quietly do
1212 # nothing because of internal deduplication, and it shouldn't
1213 # complain about being asked to export the "htm7" elements even
1214 # though there aren't any in these datasets or in the database.
1215 export.saveDatasets(datasets, elements=["htm7"])
1216 # Save one of the data IDs again; this should be harmless
1217 # because of internal deduplication.
1218 export.saveDataIds([datasets[0].dataId])
1219 # Save some dimension records directly.
1220 export.saveDimensionData("skymap", [skymapRecord])
1221 self.assertTrue(os.path.exists(exportFile))
1222 with safeTestTempDir(TESTDIR) as importDir:
1223 # We always want this to be a local posix butler
1224 Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
1225 # Calling script.butlerImport tests the implementation of the
1226 # butler command line interface "import" subcommand. Functions
1227 # in the script folder are generally considered protected and
1228 # should not be used as public api.
1229 with open(exportFile, "r") as f:
1230 script.butlerImport(
1231 importDir,
1232 export_file=f,
1233 directory=exportDir,
1234 transfer="auto",
1235 skip_dimensions=None,
1236 reuse_ids=False,
1237 )
1238 importButler = Butler(importDir, run=self.default_run)
1239 for ref in datasets:
1240 with self.subTest(ref=ref):
1241 # Test for existence by passing in the DatasetType and
1242 # data ID separately, to avoid lookup by dataset_id.
1243 self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
1244 self.assertEqual(
1245 list(importButler.registry.queryDimensionRecords("skymap")),
1246 [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
1247 )
1249 def testRemoveRuns(self):
1250 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1251 butler = Butler(self.tmpConfigFile, writeable=True)
1252 # Load registry data with dimensions to hang datasets off of.
1253 registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
1254 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1255 # Add some RUN-type collections.
1256 run1 = "run1"
1257 butler.registry.registerRun(run1)
1258 run2 = "run2"
1259 butler.registry.registerRun(run2)
1260 # put a dataset in each
1261 metric = makeExampleMetrics()
1262 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1263 datasetType = self.addDatasetType(
1264 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1265 )
1266 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1267 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1268 uri1 = butler.getURI(ref1, collections=[run1])
1269 uri2 = butler.getURI(ref2, collections=[run2])
1271 with self.assertRaises(OrphanedRecordError):
1272 butler.registry.removeDatasetType(datasetType.name)
1274 # Remove from both runs with different values for unstore.
1275 butler.removeRuns([run1], unstore=True)
1276 butler.removeRuns([run2], unstore=False)
1277 # Should be nothing in registry for either one, and datastore should
1278 # not think either exists.
1279 with self.assertRaises(MissingCollectionError):
1280 butler.registry.getCollectionType(run1)
1281 with self.assertRaises(MissingCollectionError):
1282 butler.registry.getCollectionType(run2)
1283 self.assertFalse(butler.datastore.exists(ref1))
1284 self.assertFalse(butler.datastore.exists(ref2))
1285 # The ref we unstored should be gone according to the URI, but the
1286 # one we forgot should still be around.
1287 self.assertFalse(uri1.exists())
1288 self.assertTrue(uri2.exists())
1290 # Now that the collections have been pruned we can remove the
1291 # dataset type
1292 butler.registry.removeDatasetType(datasetType.name)
1294 with self.assertLogs("lsst.daf.butler.registries", "INFO") as cm:
1295 butler.registry.removeDatasetType(tuple(["test*", "test*"]))
1296 self.assertIn("not defined", "\n".join(cm.output))
1299class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1300 """PosixDatastore specialization of a butler"""
1302 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1303 fullConfigKey = ".datastore.formatters"
1304 validationCanFail = True
1305 datastoreStr = ["/tmp"]
1306 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1307 registryStr = "/gen3.sqlite3"
1309 def testPathConstructor(self):
1310 """Independent test of constructor using PathLike."""
1311 butler = Butler(self.tmpConfigFile, run=self.default_run)
1312 self.assertIsInstance(butler, Butler)
1314 # And again with a Path object with the butler yaml
1315 path = pathlib.Path(self.tmpConfigFile)
1316 butler = Butler(path, writeable=False)
1317 self.assertIsInstance(butler, Butler)
1319 # And again with a Path object without the butler yaml
1320 # (making sure we skip it if the tmp config doesn't end
1321 # in butler.yaml -- which is the case for a subclass)
1322 if self.tmpConfigFile.endswith("butler.yaml"):
1323 path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
1324 butler = Butler(path, writeable=False)
1325 self.assertIsInstance(butler, Butler)
1327 def testExportTransferCopy(self):
1328 """Test local export using all transfer modes"""
1329 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1330 exportButler = self.runPutGetTest(storageClass, "test_metric")
1331 # Test that the repo actually has at least one dataset.
1332 datasets = list(exportButler.registry.queryDatasets(..., collections=...))
1333 self.assertGreater(len(datasets), 0)
1334 uris = [exportButler.getURI(d) for d in datasets]
1335 datastoreRoot = exportButler.datastore.root
1337 pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]
1339 for path in pathsInStore:
1340 # Assume local file system
1341 self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")
1343 for transfer in ("copy", "link", "symlink", "relsymlink"):
1344 with safeTestTempDir(TESTDIR) as exportDir:
1345 with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
1346 export.saveDatasets(datasets)
1347 for path in pathsInStore:
1348 self.assertTrue(
1349 self.checkFileExists(exportDir, path),
1350 f"Check that mode {transfer} exported files",
1351 )
1353 def testPruneDatasets(self):
1354 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1355 butler = Butler(self.tmpConfigFile, writeable=True)
1356 # Load registry data with dimensions to hang datasets off of.
1357 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1358 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1359 # Add some RUN-type collections.
1360 run1 = "run1"
1361 butler.registry.registerRun(run1)
1362 run2 = "run2"
1363 butler.registry.registerRun(run2)
1364 # put some datasets. ref1 and ref2 have the same data ID, and are in
1365 # different runs. ref3 has a different data ID.
1366 metric = makeExampleMetrics()
1367 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1368 datasetType = self.addDatasetType(
1369 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1370 )
1371 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1372 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1373 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1375 # Simple prune.
1376 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1377 with self.assertRaises(LookupError):
1378 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)
1380 # Put data back.
1381 ref1 = butler.put(metric, ref1.unresolved(), run=run1)
1382 ref2 = butler.put(metric, ref2.unresolved(), run=run2)
1383 ref3 = butler.put(metric, ref3.unresolved(), run=run1)
1385 # Check that in normal mode, deleting the record will lead to
1386 # trash not touching the file.
1387 uri1 = butler.datastore.getURI(ref1)
1388 butler.datastore.bridge.moveToTrash([ref1], transaction=None) # Update the dataset_location table
1389 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
1390 butler.datastore.trash(ref1)
1391 butler.datastore.emptyTrash()
1392 self.assertTrue(uri1.exists())
1393 uri1.remove() # Clean it up.
1395 # Simulate execution butler setup by deleting the datastore
1396 # record but keeping the file around and trusting.
1397 butler.datastore.trustGetRequest = True
1398 uri2 = butler.datastore.getURI(ref2)
1399 uri3 = butler.datastore.getURI(ref3)
1400 self.assertTrue(uri2.exists())
1401 self.assertTrue(uri3.exists())
1403 # Remove the datastore record.
1404 butler.datastore.bridge.moveToTrash([ref2], transaction=None) # Update the dataset_location table
1405 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
1406 self.assertTrue(uri2.exists())
1407 butler.datastore.trash([ref2, ref3])
1408 # The ref2 file is removed immediately.
1409 self.assertFalse(uri2.exists())
1410 # But ref3 has to wait for emptyTrash().
1411 self.assertTrue(uri3.exists())
1412 butler.datastore.emptyTrash()
1413 self.assertFalse(uri3.exists())
1415 # Clear out the datasets from registry.
1416 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1418 def testPytypeCoercion(self):
1419 """Test python type coercion on Butler.get and put."""
1421 # Store some data with the normal example storage class.
1422 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1423 datasetTypeName = "test_metric"
1424 butler = self.runPutGetTest(storageClass, datasetTypeName)
1426 dataId = {"instrument": "DummyCamComp", "visit": 423}
1427 metric = butler.get(datasetTypeName, dataId=dataId)
1428 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1430 datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
1431 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1433 # Now need to hack the registry dataset type definition.
1434 # There is no API for this.
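# The low-level update below rewrites the storage_class column of the
# static dataset_type table directly, bypassing the public registry API;
# the refresh() call afterwards clears the cached definition.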
1435 manager = butler.registry._managers.datasets
1436 manager._db.update(
1437 manager._static.dataset_type,
1438 {"name": datasetTypeName},
1439 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1440 )
1442 # Force reset of dataset type cache
1443 butler.registry.refresh()
1445 datasetType_new = butler.registry.getDatasetType(datasetTypeName)
1446 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1447 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1449 metric_model = butler.get(datasetTypeName, dataId=dataId)
1450 self.assertNotEqual(type(metric_model), type(metric))
1451 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1453 # Put the model and read it back to show that everything now
1454 # works as normal.
1455 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1456 metric_model_new = butler.get(metric_ref)
1457 self.assertEqual(metric_model_new, metric_model)
1459 # Hack the storage class again to something that will fail on the
1460 # get with no conversion class.
1461 manager._db.update(
1462 manager._static.dataset_type,
1463 {"name": datasetTypeName},
1464 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1465 )
1466 butler.registry.refresh()
1468 with self.assertRaises(ValueError):
1469 butler.get(datasetTypeName, dataId=dataId)
1472@unittest.skipUnless(testing is not None, "testing.postgresql module not found")
1473class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1474 """PosixDatastore specialization of a butler using Postgres"""
1476 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1477 fullConfigKey = ".datastore.formatters"
1478 validationCanFail = True
1479 datastoreStr = ["/tmp"]
1480 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1481 registryStr = "PostgreSQL@test"
1483 @staticmethod
1484 def _handler(postgresql):
1485 engine = sqlalchemy.engine.create_engine(postgresql.url())
1486 with engine.begin() as connection:
1487 connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;"))
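# The extension is assumed to be needed because the registry schema uses
# GiST exclusion constraints on timespans, which require btree_gist on
# PostgreSQL.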
1489 @classmethod
1490 def setUpClass(cls):
1491 # Create the postgres test server.
1492 cls.postgresql = testing.postgresql.PostgresqlFactory(
1493 cache_initialized_db=True, on_initialized=cls._handler
1494 )
1495 super().setUpClass()
1497 @classmethod
1498 def tearDownClass(cls):
1499 # Clean up any lingering SQLAlchemy engines/connections
1500 # so they're closed before we shut down the server.
1501 gc.collect()
1502 cls.postgresql.clear_cache()
1503 super().tearDownClass()
1505 def setUp(self):
1506 self.server = self.postgresql()
1508 # Need to add a registry section to the config.
1509 self._temp_config = False
1510 config = Config(self.configFile)
1511 config["registry", "db"] = self.server.url()
1512 with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh:
1513 config.dump(fh)
1514 self.configFile = fh.name
1515 self._temp_config = True
1516 super().setUp()
1518 def tearDown(self):
1519 self.server.stop()
1520 if self._temp_config and os.path.exists(self.configFile):
1521 os.remove(self.configFile)
1522 super().tearDown()
1524 def testMakeRepo(self):
1525 # The base class test assumes that it is using SQLite and that
1526 # the config file is acceptable to SQLite.
1527 raise unittest.SkipTest("Postgres config is not compatible with this test.")
1530class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1531 """InMemoryDatastore specialization of a butler"""
1533 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1534 fullConfigKey = None
1535 useTempRoot = False
1536 validationCanFail = False
1537 datastoreStr = ["datastore='InMemory"]
1538 datastoreName = ["InMemoryDatastore@"]
1539 registryStr = "/gen3.sqlite3"
1541 def testIngest(self):
1542 pass
1545class ChainedDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1546 """PosixDatastore specialization"""
1548 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
1549 fullConfigKey = ".datastore.datastores.1.formatters"
1550 validationCanFail = True
1551 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
1552 datastoreName = [
1553 "InMemoryDatastore@",
1554 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
1555 "SecondDatastore",
1556 ]
1557 registryStr = "/gen3.sqlite3"
1560class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
1561 """Test that a yaml file in one location can refer to a root in another."""
1563 datastoreStr = ["dir1"]
1564 # Disable the makeRepo test since we are deliberately not using
1565 # butler.yaml as the config name.
1566 fullConfigKey = None
1568 def setUp(self):
1569 self.root = makeTestTempDir(TESTDIR)
1571 # Make a new repository in one place
1572 self.dir1 = os.path.join(self.root, "dir1")
1573 Butler.makeRepo(self.dir1, config=Config(self.configFile))
1575 # Move the yaml file to a different place and add a "root"
1576 self.dir2 = os.path.join(self.root, "dir2")
1577 os.makedirs(self.dir2, exist_ok=True)
1578 configFile1 = os.path.join(self.dir1, "butler.yaml")
1579 config = Config(configFile1)
1580 config["root"] = self.dir1
1581 configFile2 = os.path.join(self.dir2, "butler2.yaml")
1582 config.dumpToUri(configFile2)
1583 os.remove(configFile1)
1584 self.tmpConfigFile = configFile2
1586 def testFileLocations(self):
1587 self.assertNotEqual(self.dir1, self.dir2)
1588 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
1589 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
1590 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
1593class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
1594 """Test that a config file created by makeRepo outside of repo works."""
1596 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1598 def setUp(self):
1599 self.root = makeTestTempDir(TESTDIR)
1600 self.root2 = makeTestTempDir(TESTDIR)
1602 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
1603 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1605 def tearDown(self):
1606 if os.path.exists(self.root2):
1607 shutil.rmtree(self.root2, ignore_errors=True)
1608 super().tearDown()
1610 def testConfigExistence(self):
1611 c = Config(self.tmpConfigFile)
1612 uri_config = ResourcePath(c["root"])
1613 uri_expected = ResourcePath(self.root, forceDirectory=True)
1614 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
1615 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
1617 def testPutGet(self):
1618 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1619 self.runPutGetTest(storageClass, "test_metric")
1622class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
1623 """Test that a config file created by makeRepo outside of repo works."""
1625 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1627 def setUp(self):
1628 self.root = makeTestTempDir(TESTDIR)
1629 self.root2 = makeTestTempDir(TESTDIR)
1631 self.tmpConfigFile = self.root2
1632 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1634 def testConfigExistence(self):
1635 # Append the yaml file else Config constructor does not know the file
1636 # type.
1637 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
1638 super().testConfigExistence()
1641class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
1642 """Test that a config file created by makeRepo outside of repo works."""
1644 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1646 def setUp(self):
1647 self.root = makeTestTempDir(TESTDIR)
1648 self.root2 = makeTestTempDir(TESTDIR)
1650 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
1651 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1654@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
1655class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1656 """S3Datastore specialization of a butler; an S3 storage Datastore +
1657 a local in-memory SqlRegistry.
1658 """
1660 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
1661 fullConfigKey = None
1662 validationCanFail = True
1664 bucketName = "anybucketname"
1665 """Name of the Bucket that will be used in the tests. The name is read from
1666 the config file used with the tests during set-up.
1667 """
1669 root = "butlerRoot/"
1670 """Root repository directory expected to be used in case useTempRoot=False.
1671 Otherwise the root is set to a 20 characters long randomly generated string
1672 during set-up.
1673 """
1675 datastoreStr = [f"datastore={root}"]
1676 """Contains all expected root locations in a format expected to be
1677 returned by Butler stringification.
1678 """
1680 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
1681 """The expected format of the S3 Datastore string."""
1683 registryStr = "/gen3.sqlite3"
1684 """Expected format of the Registry string."""
1686 mock_s3 = mock_s3()
1687 """The mocked s3 interface from moto."""
1689 def genRoot(self):
1690 """Returns a random string of len 20 to serve as a root
1691 name for the temporary bucket repo.
1693 This is equivalent to tempfile.mkdtemp as this is what self.root
1694 becomes when useTempRoot is True.
1695 """
1696 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1697 return rndstr + "/"
1699 def setUp(self):
1700 config = Config(self.configFile)
1701 uri = ResourcePath(config[".datastore.datastore.root"])
1702 self.bucketName = uri.netloc
1704 # Enable S3 mocking of tests.
1705 self.mock_s3.start()
1707 # Set up some fake credentials if none exist.
1708 self.usingDummyCredentials = setAwsEnvCredentials()
1710 if self.useTempRoot:
1711 self.root = self.genRoot()
1712 rooturi = f"s3://{self.bucketName}/{self.root}"
1713 config.update({"datastore": {"datastore": {"root": rooturi}}})
1715 # Need a local folder to store the registry database.
1716 self.reg_dir = makeTestTempDir(TESTDIR)
1717 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1719 # Moto needs to know that we expect the bucket to exist
1720 # (its name used to be the class attribute bucketName).
1721 s3 = boto3.resource("s3")
1722 s3.create_bucket(Bucket=self.bucketName)
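# With moto's mock_s3 active, the bucket exists only in memory; no real
# AWS request is made.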
1724 self.datastoreStr = f"datastore={self.root}"
1725 self.datastoreName = [f"FileDatastore@{rooturi}"]
1726 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
1727 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
1729 def tearDown(self):
1730 s3 = boto3.resource("s3")
1731 bucket = s3.Bucket(self.bucketName)
1732 try:
1733 bucket.objects.all().delete()
1734 except botocore.exceptions.ClientError as e:
1735 if e.response["Error"]["Code"] == "404":
1736 # the key was not reachable - pass
1737 pass
1738 else:
1739 raise
1741 bucket = s3.Bucket(self.bucketName)
1742 bucket.delete()
1744 # Stop the S3 mock.
1745 self.mock_s3.stop()
1747 # Unset any dummy credentials that may have been set.
1748 if self.usingDummyCredentials:
1749 unsetAwsEnvCredentials()
1751 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1752 shutil.rmtree(self.reg_dir, ignore_errors=True)
1754 if self.useTempRoot and os.path.exists(self.root):
1755 shutil.rmtree(self.root, ignore_errors=True)
1757 super().tearDown()
1760class PosixDatastoreTransfers(unittest.TestCase):
1761 """Test data transfers between butlers.
1763 Tests cover different dataset ID managers: UUID to UUID and integer to
1764 integer are both exercised. UUID to integer is not supported since we
1765 do not currently want to allow that. Integer to UUID is supported, with
1766 the caveat that a UUID4 will be generated, which is incorrect for raw
1767 dataset types; the test ignores that.
1768 """
1770 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1772 @classmethod
1773 def setUpClass(cls):
1774 cls.storageClassFactory = StorageClassFactory()
1775 cls.storageClassFactory.addFromConfig(cls.configFile)
1777 def setUp(self):
1778 self.root = makeTestTempDir(TESTDIR)
1779 self.config = Config(self.configFile)
1781 def tearDown(self):
1782 removeTestTempDir(self.root)
1784 def create_butler(self, manager, label):
1785 config = Config(self.configFile)
1786 config["registry", "managers", "datasets"] = manager
1787 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
1789 def create_butlers(self, manager1, manager2):
1790 self.source_butler = self.create_butler(manager1, "1")
1791 self.target_butler = self.create_butler(manager2, "2")
1793 def testTransferUuidToUuid(self):
1794 self.create_butlers(
1795 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1796 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1797 )
1798 # Setting id_gen_map should have no effect here
1799 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1801 def _enable_trust(self, datastore) -> None:
1802 if hasattr(datastore, "trustGetRequest"):
1803 datastore.trustGetRequest = True
1804 elif hasattr(datastore, "datastores"):
1805 for datastore in datastore.datastores:
1806 if hasattr(datastore, "trustGetRequest"):
1807 datastore.trustGetRequest = True
1809 def testTransferMissing(self):
1810 """Test transfers where datastore records are missing.
1812 This is how execution butler works.
1813 """
1814 self.create_butlers(
1815 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1816 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1817 )
1819 # Configure the source butler to allow trust.
1820 self._enable_trust(self.source_butler.datastore)
1822 self.assertButlerTransfers(purge=True)
1824 def testTransferMissingDisassembly(self):
1825 """Test transfers where datastore records are missing.
1827 This is how execution butler works.
1828 """
1829 self.create_butlers(
1830 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1831 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1832 )
1834 # Configure the source butler to allow trust.
1835 self._enable_trust(self.source_butler.datastore)
1837 # Test disassembly.
1838 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
1840 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"):
1841 """Test that a run can be transferred to another butler."""
1843 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
1844 datasetTypeName = "random_data"
1846 # Test will create 3 collections and we will want to transfer
1847 # two of those three.
1848 runs = ["run1", "run2", "other"]
1850 # Also want to use two different dataset types to ensure that
1851 # grouping works.
1852 datasetTypeNames = ["random_data", "random_data_2"]
1854 # Create the run collections in the source butler.
1855 for run in runs:
1856 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
1858 # Create dimensions in source butler.
1859 n_exposures = 30
1860 self.source_butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1861 self.source_butler.registry.insertDimensionData(
1862 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
1863 )
1864 self.source_butler.registry.insertDimensionData(
1865 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
1866 )
1868 for i in range(n_exposures):
1869 self.source_butler.registry.insertDimensionData(
1870 "exposure",
1871 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
1872 )
1874 # Create dataset types in the source butler.
1875 dimensions = self.source_butler.registry.dimensions.extract(["instrument", "exposure"])
1876 for datasetTypeName in datasetTypeNames:
1877 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
1878 self.source_butler.registry.registerDatasetType(datasetType)
1880 # Write a dataset to an unrelated run -- this will ensure that
1881 # we are rewriting integer dataset ids in the target if necessary.
1882 # Will not be relevant for UUID.
1883 run = "distraction"
1884 butler = Butler(butler=self.source_butler, run=run)
1885 butler.put(
1886 makeExampleMetrics(),
1887 datasetTypeName,
1888 exposure=1,
1889 instrument="DummyCamComp",
1890 physical_filter="d-r",
1891 )
1893 # Write some example metrics to the source
1894 butler = Butler(butler=self.source_butler)
1896 # Set of DatasetRefs that should be in the list of refs to transfer
1897 # but which will not be transferred.
1898 deleted = set()
1900 n_expected = 20 # Number of datasets expected to be transferred
1901 source_refs = []
1902 for i in range(n_exposures):
1903 # Put a third of the datasets into each collection; only retain
1904 # two thirds of them.
1905 index = i % 3
1906 run = runs[index]
1907 datasetTypeName = datasetTypeNames[i % 2]
1909 metric_data = {
1910 "summary": {"counter": i},
1911 "output": {"text": "metric"},
1912 "data": [2 * x for x in range(i)],
1913 }
1914 metric = MetricsExample(**metric_data)
1915 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
1916 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
1918 # Remove the datastore record using low-level API
1919 if purge:
1920 # Remove records for a fraction.
1921 if index == 1:
1922 # For one of these delete the file as well.
1923 # This allows the "missing" code to filter the
1924 # file out.
1925 # Access the individual datastores.
1926 datastores = []
1927 if hasattr(butler.datastore, "datastores"):
1928 datastores.extend(butler.datastore.datastores)
1929 else:
1930 datastores.append(butler.datastore)
1932 if not deleted:
1933 # For a chained datastore we need to remove
1934 # files in each chain.
1935 for datastore in datastores:
1936 # The file might not be known to the datastore
1937 # if constraints are used.
1938 try:
1939 primary, uris = datastore.getURIs(ref)
1940 except FileNotFoundError:
1941 continue
1942 if primary:
1943 if primary.scheme != "mem":
1944 primary.remove()
1945 for uri in uris.values():
1946 if uri.scheme != "mem":
1947 uri.remove()
1948 n_expected -= 1
1949 deleted.add(ref)
1951 # Remove the datastore record.
1952 for datastore in datastores:
1953 if hasattr(datastore, "removeStoredItemInfo"):
1954 datastore.removeStoredItemInfo(ref)
1956 if index < 2:
1957 source_refs.append(ref)
1958 if ref not in deleted:
1959 new_metric = butler.get(ref.unresolved(), collections=run)
1960 self.assertEqual(new_metric, metric)
1962 # Create some bad dataset types to ensure we check for inconsistent
1963 # definitions.
1964 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
1965 for datasetTypeName in datasetTypeNames:
1966 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
1967 self.target_butler.registry.registerDatasetType(datasetType)
1968 with self.assertRaises(ConflictingDefinitionError) as cm:
1969 self.target_butler.transfer_from(self.source_butler, source_refs)
1970 self.assertIn("dataset type differs", str(cm.exception))
1972 # And remove the bad definitions.
1973 for datasetTypeName in datasetTypeNames:
1974 self.target_butler.registry.removeDatasetType(datasetTypeName)
1976 # Transfer without creating dataset types should fail.
1977 with self.assertRaises(KeyError):
1978 self.target_butler.transfer_from(self.source_butler, source_refs)
1980 # Transfer without creating dimensions should fail.
1981 with self.assertRaises(ConflictingDefinitionError) as cm:
1982 self.target_butler.transfer_from(self.source_butler, source_refs, register_dataset_types=True)
1983 self.assertIn("dimension", str(cm.exception))
1985 # The failed transfer above leaves the registry in an inconsistent
1986 # state because the run is created but then rolled back without
1987 # the collection cache being cleared. For now force a refresh.
1988 # Can remove with DM-35498.
1989 self.target_butler.registry.refresh()
1991 # Now transfer them to the second butler, including dimensions.
1992 with self.assertLogs(level=logging.DEBUG) as cm:
1993 transferred = self.target_butler.transfer_from(
1994 self.source_butler,
1995 source_refs,
1996 register_dataset_types=True,
1997 transfer_dimensions=True,
1998 )
1999 self.assertEqual(len(transferred), n_expected)
2000 log_output = ";".join(cm.output)
2002 # A ChainedDatastore will use the in-memory datastore for mexists,
2003 # so we cannot rely on the mexists log message.
2004 self.assertIn("Number of datastore records found in source", log_output)
2005 self.assertIn("Creating output run", log_output)
2007 # Do the transfer twice to ensure that it will do nothing extra.
2008 # Only do this if purge=True because it does not work for int
2009 # dataset_id.
2010 if purge:
2011 # This should not need to register dataset types.
2012 transferred = self.target_butler.transfer_from(self.source_butler, source_refs)
2013 self.assertEqual(len(transferred), n_expected)
2015 # Also do an explicit low-level transfer to trigger some
2016 # edge cases.
2017 with self.assertLogs(level=logging.DEBUG) as cm:
2018 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
2019 log_output = ";".join(cm.output)
2020 self.assertIn("no file artifacts exist", log_output)
2022 with self.assertRaises((TypeError, AttributeError)):
2023 self.target_butler.datastore.transfer_from(self.source_butler, source_refs)
2025 with self.assertRaises(ValueError):
2026 self.target_butler.datastore.transfer_from(
2027 self.source_butler.datastore, source_refs, transfer="split"
2028 )
2030 # Now try to get the same refs from the new butler.
2031 for ref in source_refs:
2032 if ref not in deleted:
2033 unresolved_ref = ref.unresolved()
2034 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run)
2035 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run)
2036 self.assertEqual(new_metric, old_metric)
2038 # Now prune the run2 collection and create a CHAINED collection instead.
2039 # This should block the transfer.
2040 self.target_butler.removeRuns(["run2"], unstore=True)
2041 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2042 with self.assertRaises(CollectionTypeError):
2043 # Re-importing the run1 datasets can be problematic if they
2044 # use integer IDs so filter those out.
2045 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2046 self.target_butler.transfer_from(self.source_butler, to_transfer)
2049class ChainedDatastoreTransfers(PosixDatastoreTransfers):
2050 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
2053if __name__ == "__main__":
2054 unittest.main()