Coverage for tests/test_butler.py: 12%
1128 statements
coverage.py v7.2.5, created at 2023-05-03 09:15 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Tests for Butler.
23"""
25import gc
26import json
27import logging
28import os
29import pathlib
30import pickle
31import posixpath
32import random
33import shutil
34import string
35import tempfile
36import unittest
38try:
39 import boto3
40 import botocore
41 from moto import mock_s3
42except ImportError:
43 boto3 = None
45 def mock_s3(cls):
46 """A no-op decorator in case moto mock_s3 can not be imported."""
47 return cls
50try:
51 # It's possible but silly to have testing.postgresql installed without
52 # having the postgresql server installed (because then nothing in
53 # testing.postgresql would work), so we use the presence of that module
54 # to test whether we can expect the server to be available.
55 import testing.postgresql
56except ImportError:
57 testing = None
59import astropy.time
60import sqlalchemy
61from lsst.daf.butler import (
62 Butler,
63 ButlerConfig,
64 CollectionType,
65 Config,
66 DatasetIdGenEnum,
67 DatasetRef,
68 DatasetType,
69 FileDataset,
70 FileTemplate,
71 FileTemplateValidationError,
72 StorageClassFactory,
73 ValidationError,
74 script,
75)
76from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
77from lsst.daf.butler.registry import (
78 CollectionError,
79 CollectionTypeError,
80 ConflictingDefinitionError,
81 DataIdValueError,
82 MissingCollectionError,
83 OrphanedRecordError,
84)
85from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
86from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
87from lsst.resources import ResourcePath
88from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
89from lsst.utils import doImport
90from lsst.utils.introspection import get_full_type_name
92TESTDIR = os.path.abspath(os.path.dirname(__file__))
95def makeExampleMetrics():
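    """Return a MetricsExample populated with fixed example values for testing."""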
96 return MetricsExample(
97 {"AM1": 5.2, "AM2": 30.6},
98 {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
99 [563, 234, 456.7, 752, 8, 9, 27],
100 )
103class TransactionTestError(Exception):
104 """Specific error for testing transactions, to prevent misdiagnosing
105 that might otherwise occur when a standard exception is used.
106 """
108 pass
111class ButlerConfigTests(unittest.TestCase):
112 """Simple tests for ButlerConfig that are not tested in any other test
113 cases."""
115 def testSearchPath(self):
116 configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
117 with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
118 config1 = ButlerConfig(configFile)
119 self.assertNotIn("testConfigs", "\n".join(cm.output))
121 overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
122 with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
123 config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
124 self.assertIn("testConfigs", "\n".join(cm.output))
126 key = ("datastore", "records", "table")
127 self.assertNotEqual(config1[key], config2[key])
128 self.assertEqual(config2[key], "override_record")
131class ButlerPutGetTests:
132 """Helper method for running a suite of put/get tests from different
133 butler configurations."""
135 root = None
136 default_run = "ingésτ😺"
138 @staticmethod
139 def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
140 """Create a DatasetType and register it"""
141 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
142 registry.registerDatasetType(datasetType)
143 return datasetType
145 @classmethod
146 def setUpClass(cls):
147 cls.storageClassFactory = StorageClassFactory()
148 cls.storageClassFactory.addFromConfig(cls.configFile)
150 def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
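        """Assert that each named component of ``datasetRef`` can be retrieved,
        both directly and via a deferred handle, and that each value matches the
        corresponding attribute of ``reference``."""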
151 datasetType = datasetRef.datasetType
152 dataId = datasetRef.dataId
153 deferred = butler.getDeferred(datasetRef)
155 for component in components:
156 compTypeName = datasetType.componentTypeName(component)
157 result = butler.get(compTypeName, dataId, collections=collections)
158 self.assertEqual(result, getattr(reference, component))
159 result_deferred = deferred.get(component=component)
160 self.assertEqual(result_deferred, result)
162 def tearDown(self):
163 removeTestTempDir(self.root)
165 def create_butler(self, run, storageClass, datasetTypeName):
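        """Create a Butler for the given run, register the named dataset type
        with instrument+visit dimensions, and insert the dimension records used
        by the put/get tests. Returns the butler and the registered dataset type."""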
166 butler = Butler(self.tmpConfigFile, run=run)
168 collections = set(butler.registry.queryCollections())
169 self.assertEqual(collections, set([run]))
171 # Create and register a DatasetType
172 dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
174 datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
176 # Add needed Dimensions
177 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
178 butler.registry.insertDimensionData(
179 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
180 )
181 butler.registry.insertDimensionData(
182 "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
183 )
184 visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
185 visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
186 butler.registry.insertDimensionData(
187 "visit",
188 {
189 "instrument": "DummyCamComp",
190 "id": 423,
191 "name": "fourtwentythree",
192 "physical_filter": "d-r",
193 "visit_system": 1,
194 "datetime_begin": visit_start,
195 "datetime_end": visit_end,
196 },
197 )
199 # Add more visits for some later tests
200 for visit_id in (424, 425):
201 butler.registry.insertDimensionData(
202 "visit",
203 {
204 "instrument": "DummyCamComp",
205 "id": visit_id,
206 "name": f"fourtwentyfour_{visit_id}",
207 "physical_filter": "d-r",
208 "visit_system": 1,
209 },
210 )
211 return butler, datasetType
213 def runPutGetTest(self, storageClass, datasetTypeName):
214 # New datasets will be added to a run collection, and lookups are made
215 # explicitly against that run.
216 run = self.default_run
217 butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)
219 # Create and store a dataset
220 metric = makeExampleMetrics()
221 dataId = {"instrument": "DummyCamComp", "visit": 423}
223 # Create a DatasetRef for put
224 refIn = DatasetRef(datasetType, dataId, id=None)
226 # Put with a preexisting id should fail
227 with self.assertRaises(ValueError):
228 butler.put(metric, DatasetRef(datasetType, dataId, id=100))
230 # Put and remove the dataset once as a DatasetRef, once as a dataId,
231 # and once with a DatasetType
233 # Keep track of any collections we add and do not clean up
234 expected_collections = {run}
236 counter = 0
237 for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
238 # Since we are using subTest we can get cascading failures
239 # here with the first attempt failing and the others failing
240 # immediately because the dataset already exists. Work around
241 # this by using a distinct run collection each time
242 counter += 1
243 this_run = f"put_run_{counter}"
244 butler.registry.registerCollection(this_run, type=CollectionType.RUN)
245 expected_collections.update({this_run})
247 with self.subTest(args=args):
248 ref = butler.put(metric, *args, run=this_run)
249 self.assertIsInstance(ref, DatasetRef)
251 # Test direct retrieval using only the resolved ref
252 metricOut = butler.get(ref)
253 self.assertEqual(metric, metricOut)
254 # Test get
255 metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
256 self.assertEqual(metric, metricOut)
257 # Test get with a datasetRef
258 metricOut = butler.get(ref, collections=this_run)
259 self.assertEqual(metric, metricOut)
260 # Test getDeferred with dataId
261 metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
262 self.assertEqual(metric, metricOut)
263 # Test getDeferred with a datasetRef
264 metricOut = butler.getDeferred(ref, collections=this_run).get()
265 self.assertEqual(metric, metricOut)
266 # and deferred direct with ref
267 metricOut = butler.getDeferred(ref).get()
268 self.assertEqual(metric, metricOut)
270 # Check we can get components
271 if storageClass.isComposite():
272 self.assertGetComponents(
273 butler, ref, ("summary", "data", "output"), metric, collections=this_run
274 )
276 # Can the artifacts themselves be retrieved?
277 if not butler.datastore.isEphemeral:
278 root_uri = ResourcePath(self.root)
280 for preserve_path in (True, False):
281 destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
282 # Use copy so that we can test that overwrite
283 # protection works (using "auto" for File URIs would
284 # use hard links and subsequent transfer would work
285 # because it knows they are the same file).
286 transferred = butler.retrieveArtifacts(
287 [ref], destination, preserve_path=preserve_path, transfer="copy"
288 )
289 self.assertGreater(len(transferred), 0)
290 artifacts = list(ResourcePath.findFileResources([destination]))
291 self.assertEqual(set(transferred), set(artifacts))
293 for artifact in transferred:
294 path_in_destination = artifact.relative_to(destination)
295 self.assertIsNotNone(path_in_destination)
297 # when path is not preserved there should not be
298 # any path separators.
299 num_seps = path_in_destination.count("/")
300 if preserve_path:
301 self.assertGreater(num_seps, 0)
302 else:
303 self.assertEqual(num_seps, 0)
305 primary_uri, secondary_uris = butler.datastore.getURIs(ref)
306 n_uris = len(secondary_uris)
307 if primary_uri:
308 n_uris += 1
309 self.assertEqual(
310 len(artifacts),
311 n_uris,
312 "Comparing expected artifacts vs actual:"
313 f" {artifacts} vs {primary_uri} and {secondary_uris}",
314 )
316 if preserve_path:
317 # No need to run these twice
318 with self.assertRaises(ValueError):
319 butler.retrieveArtifacts([ref], destination, transfer="move")
321 with self.assertRaises(FileExistsError):
322 butler.retrieveArtifacts([ref], destination)
324 transferred_again = butler.retrieveArtifacts(
325 [ref], destination, preserve_path=preserve_path, overwrite=True
326 )
327 self.assertEqual(set(transferred_again), set(transferred))
329 # Now remove the dataset completely.
330 butler.pruneDatasets([ref], purge=True, unstore=True)
331 # Lookup with original args should still fail.
332 with self.assertRaises(LookupError):
333 butler.datasetExists(*args, collections=this_run)
334 # get() should still fail.
335 with self.assertRaises(FileNotFoundError):
336 butler.get(ref)
337 # Registry shouldn't be able to find it by dataset_id anymore.
338 self.assertIsNone(butler.registry.getDataset(ref.id))
340 # Do explicit registry removal since we know they are
341 # empty
342 butler.registry.removeCollection(this_run)
343 expected_collections.remove(this_run)
345 # Put the dataset again, since the last thing we did was remove it
346 # and we want to use the default collection.
347 ref = butler.put(metric, refIn)
349 # Get with parameters
350 stop = 4
351 sliced = butler.get(ref, parameters={"slice": slice(stop)})
352 self.assertNotEqual(metric, sliced)
353 self.assertEqual(metric.summary, sliced.summary)
354 self.assertEqual(metric.output, sliced.output)
355 self.assertEqual(metric.data[:stop], sliced.data)
356 # getDeferred with parameters
357 sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
358 self.assertNotEqual(metric, sliced)
359 self.assertEqual(metric.summary, sliced.summary)
360 self.assertEqual(metric.output, sliced.output)
361 self.assertEqual(metric.data[:stop], sliced.data)
362 # getDeferred with deferred parameters
363 sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
364 self.assertNotEqual(metric, sliced)
365 self.assertEqual(metric.summary, sliced.summary)
366 self.assertEqual(metric.output, sliced.output)
367 self.assertEqual(metric.data[:stop], sliced.data)
369 if storageClass.isComposite():
370 # Check that components can be retrieved
371 metricOut = butler.get(ref.datasetType.name, dataId)
372 compNameS = ref.datasetType.componentTypeName("summary")
373 compNameD = ref.datasetType.componentTypeName("data")
374 summary = butler.get(compNameS, dataId)
375 self.assertEqual(summary, metric.summary)
376 data = butler.get(compNameD, dataId)
377 self.assertEqual(data, metric.data)
379 if "counter" in storageClass.derivedComponents:
380 count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
381 self.assertEqual(count, len(data))
383 count = butler.get(
384 ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
385 )
386 self.assertEqual(count, stop)
388 compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
389 summary = butler.get(compRef)
390 self.assertEqual(summary, metric.summary)
392 # Create a Dataset type that has the same name but is inconsistent.
393 inconsistentDatasetType = DatasetType(
394 datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
395 )
397 # Getting with a dataset type that does not match registry fails
398 with self.assertRaises(ValueError):
399 butler.get(inconsistentDatasetType, dataId)
401 # Combining a DatasetRef with a dataId should fail
402 with self.assertRaises(ValueError):
403 butler.get(ref, dataId)
404 # Getting with an explicit ref should fail if the id doesn't match
405 with self.assertRaises(ValueError):
406 butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))
408 # Getting a dataset with unknown parameters should fail
409 with self.assertRaises(KeyError):
410 butler.get(ref, parameters={"unsupported": True})
412 # Check we have a collection
413 collections = set(butler.registry.queryCollections())
414 self.assertEqual(collections, expected_collections)
416 # Clean up to check that we can remove something that may have
417 # already had a component removed
418 butler.pruneDatasets([ref], unstore=True, purge=True)
420 # Check that we can configure a butler to accept a put even
421 # if it already has the dataset in registry.
422 ref = butler.put(metric, refIn)
424 # Repeat put will fail.
425 with self.assertRaises(ConflictingDefinitionError):
426 butler.put(metric, refIn)
428 # Remove the datastore entry.
429 butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)
431 # Put will still fail
432 with self.assertRaises(ConflictingDefinitionError):
433 butler.put(metric, refIn)
435 # Allow the put to succeed
436 butler._allow_put_of_predefined_dataset = True
437 ref2 = butler.put(metric, refIn)
438 self.assertEqual(ref2.id, ref.id)
440 # A second put will still fail but with a different exception
441 # than before.
442 with self.assertRaises(ConflictingDefinitionError):
443 butler.put(metric, refIn)
445 # Reset the flag to avoid confusion
446 butler._allow_put_of_predefined_dataset = False
448 # Leave the dataset in place since some downstream tests require
449 # something to be present
451 return butler
453 def testDeferredCollectionPassing(self):
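        """Test a butler constructed without a default run or collections,
        passing run/collections explicitly to put, get, and datasetExists."""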
454 # Construct a butler with no run or collection, but make it writeable.
455 butler = Butler(self.tmpConfigFile, writeable=True)
456 # Create and register a DatasetType
457 dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
458 datasetType = self.addDatasetType(
459 "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
460 )
461 # Add needed Dimensions
462 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
463 butler.registry.insertDimensionData(
464 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
465 )
466 butler.registry.insertDimensionData(
467 "visit",
468 {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
469 )
470 dataId = {"instrument": "DummyCamComp", "visit": 423}
471 # Create dataset.
472 metric = makeExampleMetrics()
473 # Register a new run and put dataset.
474 run = "deferred"
475 self.assertTrue(butler.registry.registerRun(run))
476 # A second registration is allowed but indicates a no-op.
477 self.assertFalse(butler.registry.registerRun(run))
478 ref = butler.put(metric, datasetType, dataId, run=run)
479 # Putting with no run should fail with CollectionError.
480 with self.assertRaises(CollectionError):
481 butler.put(metric, datasetType, dataId)
482 # Dataset should exist.
483 self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
484 # We should be able to get the dataset back, but with and without
485 # a deferred dataset handle.
486 self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
487 self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
488 # Trying to find the dataset without any collection is a CollectionError.
489 with self.assertRaises(CollectionError):
490 butler.datasetExists(datasetType, dataId)
491 with self.assertRaises(CollectionError):
492 butler.get(datasetType, dataId)
493 # Associate the dataset with a different collection.
494 butler.registry.registerCollection("tagged")
495 butler.registry.associate("tagged", [ref])
496 # Deleting the dataset from the new collection should still leave it findable
497 # in the original collection.
498 butler.pruneDatasets([ref], tags=["tagged"])
499 self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
502class ButlerTests(ButlerPutGetTests):
503 """Tests for Butler."""
505 useTempRoot = True
507 def setUp(self):
508 """Create a new butler root for each test."""
509 self.root = makeTestTempDir(TESTDIR)
510 Butler.makeRepo(self.root, config=Config(self.configFile))
511 self.tmpConfigFile = os.path.join(self.root, "butler.yaml")
513 def testConstructor(self):
514 """Independent test of constructor."""
515 butler = Butler(self.tmpConfigFile, run=self.default_run)
516 self.assertIsInstance(butler, Butler)
518 # Check that butler.yaml is added automatically.
519 if self.tmpConfigFile.endswith(end := "/butler.yaml"):
520 config_dir = self.tmpConfigFile[: -len(end)]
521 butler = Butler(config_dir, run=self.default_run)
522 self.assertIsInstance(butler, Butler)
524 # Even with a ResourcePath.
525 butler = Butler(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
526 self.assertIsInstance(butler, Butler)
528 collections = set(butler.registry.queryCollections())
529 self.assertEqual(collections, {self.default_run})
531 # Check that some special characters can be included in run name.
532 special_run = "u@b.c-A"
533 butler_special = Butler(butler=butler, run=special_run)
534 collections = set(butler_special.registry.queryCollections("*@*"))
535 self.assertEqual(collections, {special_run})
537 butler2 = Butler(butler=butler, collections=["other"])
538 self.assertEqual(butler2.collections, ("other",))
539 self.assertIsNone(butler2.run)
540 self.assertIs(butler.datastore, butler2.datastore)
542 # Test that we can use an environment variable to find this
543 # repository.
544 butler_index = Config()
545 butler_index["label"] = self.tmpConfigFile
546 for suffix in (".yaml", ".json"):
547 # Ensure that the content differs so that we know that
548 # we aren't reusing the cache.
549 bad_label = f"s3://bucket/not_real{suffix}"
550 butler_index["bad_label"] = bad_label
551 with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
552 butler_index.dumpToUri(temp_file)
553 with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
554 self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
555 uri = Butler.get_repo_uri("bad_label")
556 self.assertEqual(uri, ResourcePath(bad_label))
557 uri = Butler.get_repo_uri("label")
558 butler = Butler(uri, writeable=False)
559 self.assertIsInstance(butler, Butler)
560 butler = Butler("label", writeable=False)
561 self.assertIsInstance(butler, Butler)
562 with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
563 Butler("not_there", writeable=False)
564 with self.assertRaises(KeyError) as cm:
565 Butler.get_repo_uri("missing")
566 self.assertIn("not known to", str(cm.exception))
567 with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
568 with self.assertRaises(FileNotFoundError):
569 Butler.get_repo_uri("label")
570 self.assertEqual(Butler.get_known_repos(), set())
571 with self.assertRaises(KeyError) as cm:
572 # No environment variable set.
573 Butler.get_repo_uri("label")
574 self.assertIn("No repository index defined", str(cm.exception))
575 with self.assertRaisesRegex(FileNotFoundError, "no known aliases"):
576 # No aliases registered.
577 Butler("not_there")
578 self.assertEqual(Butler.get_known_repos(), set())
580 def testBasicPutGet(self):
581 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
582 self.runPutGetTest(storageClass, "test_metric")
584 def testCompositePutGetConcrete(self):
585 storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
586 butler = self.runPutGetTest(storageClass, "test_metric")
588 # Should *not* be disassembled
589 datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
590 self.assertEqual(len(datasets), 1)
591 uri, components = butler.getURIs(datasets[0])
592 self.assertIsInstance(uri, ResourcePath)
593 self.assertFalse(components)
594 self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
595 self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
597 # Predicted dataset
598 dataId = {"instrument": "DummyCamComp", "visit": 424}
599 uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
600 self.assertFalse(components)
601 self.assertIsInstance(uri, ResourcePath)
602 self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
603 self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
605 def testCompositePutGetVirtual(self):
606 storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
607 butler = self.runPutGetTest(storageClass, "test_metric_comp")
609 # Should be disassembled
610 datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
611 self.assertEqual(len(datasets), 1)
612 uri, components = butler.getURIs(datasets[0])
614 if butler.datastore.isEphemeral:
615 # Never disassemble in-memory datastore
616 self.assertIsInstance(uri, ResourcePath)
617 self.assertFalse(components)
618 self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
619 self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
620 else:
621 self.assertIsNone(uri)
622 self.assertEqual(set(components), set(storageClass.components))
623 for compuri in components.values():
624 self.assertIsInstance(compuri, ResourcePath)
625 self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
626 self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")
628 # Predicted dataset
629 dataId = {"instrument": "DummyCamComp", "visit": 424}
630 uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
632 if butler.datastore.isEphemeral:
633 # Never disassembled
634 self.assertIsInstance(uri, ResourcePath)
635 self.assertFalse(components)
636 self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
637 self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
638 else:
639 self.assertIsNone(uri)
640 self.assertEqual(set(components), set(storageClass.components))
641 for compuri in components.values():
642 self.assertIsInstance(compuri, ResourcePath)
643 self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
644 self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")
646 def testStorageClassOverrideGet(self):
647 """Test storage class conversion on get with override."""
648 storageClass = self.storageClassFactory.getStorageClass("StructuredData")
649 datasetTypeName = "anything"
650 run = self.default_run
652 butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)
654 # Create and store a dataset.
655 metric = makeExampleMetrics()
656 dataId = {"instrument": "DummyCamComp", "visit": 423}
658 ref = butler.put(metric, datasetType, dataId)
660 # Return native type.
661 retrieved = butler.get(ref)
662 self.assertEqual(retrieved, metric)
664 # Specify an override.
665 new_sc = self.storageClassFactory.getStorageClass("MetricsConversion")
666 model = butler.get(ref, storageClass=new_sc)
667 self.assertNotEqual(type(model), type(retrieved))
668 self.assertIs(type(model), new_sc.pytype)
669 self.assertEqual(retrieved, model)
671 # Defer but override later.
672 deferred = butler.getDeferred(ref)
673 model = deferred.get(storageClass=new_sc)
674 self.assertIs(type(model), new_sc.pytype)
675 self.assertEqual(retrieved, model)
677 # Defer but override up front.
678 deferred = butler.getDeferred(ref, storageClass=new_sc)
679 model = deferred.get()
680 self.assertIs(type(model), new_sc.pytype)
681 self.assertEqual(retrieved, model)
683 # Retrieve a component. Should be a tuple.
684 data = butler.get("anything.data", dataId, storageClass="StructuredDataDataTestTuple")
685 self.assertIs(type(data), tuple)
686 self.assertEqual(data, tuple(retrieved.data))
688 # Parameter on the write storage class should work regardless
689 # of read storage class.
690 data = butler.get(
691 "anything.data",
692 dataId,
693 storageClass="StructuredDataDataTestTuple",
694 parameters={"slice": slice(2, 4)},
695 )
696 self.assertEqual(len(data), 2)
698 # Try a parameter that is known to the read storage class but not
699 # the write storage class.
700 with self.assertRaises(KeyError):
701 butler.get(
702 "anything.data",
703 dataId,
704 storageClass="StructuredDataDataTestTuple",
705 parameters={"xslice": slice(2, 4)},
706 )
708 def testPytypePutCoercion(self):
709 """Test python type coercion on Butler.get and put."""
711 # Store some data with the normal example storage class.
712 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
713 datasetTypeName = "test_metric"
714 butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)
716 dataId = {"instrument": "DummyCamComp", "visit": 423}
718 # Put a dict and this should coerce to a MetricsExample
719 test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
720 metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
721 test_metric = butler.get(metric_ref)
722 self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
723 self.assertEqual(test_metric.summary, test_dict["summary"])
724 self.assertEqual(test_metric.output, test_dict["output"])
726 # Check that the put still works if a DatasetType is given with
727 # a definition matching this python type.
728 registry_type = butler.registry.getDatasetType(datasetTypeName)
729 this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
730 metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
731 self.assertEqual(metric2_ref.datasetType, registry_type)
733 # The get will return the type expected by registry.
734 test_metric2 = butler.get(metric2_ref)
735 self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")
737 # Make a new DatasetRef with the compatible but different DatasetType.
738 # This should now return a dict.
739 new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
740 test_dict2 = butler.get(new_ref)
741 self.assertEqual(get_full_type_name(test_dict2), "dict")
743 # Get it again with the wrong dataset type definition, this time passing
744 # the dataset type and data ID rather than the ref. This should be
745 # consistent with ref-based get() behavior and return the type of the DatasetType.
746 test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
747 self.assertEqual(get_full_type_name(test_dict3), "dict")
749 def testIngest(self):
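        """Test ingesting externally created files, both one file per dataset
        and multiple datasets sharing a single file."""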
750 butler = Butler(self.tmpConfigFile, run=self.default_run)
752 # Create and register a DatasetType
753 dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])
755 storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
756 datasetTypeName = "metric"
758 datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
760 # Add needed Dimensions
761 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
762 butler.registry.insertDimensionData(
763 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
764 )
765 for detector in (1, 2):
766 butler.registry.insertDimensionData(
767 "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
768 )
770 butler.registry.insertDimensionData(
771 "visit",
772 {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
773 {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
774 )
776 formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
777 dataRoot = os.path.join(TESTDIR, "data", "basic")
778 datasets = []
779 for detector in (1, 2):
780 detector_name = f"detector_{detector}"
781 metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
782 dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
783 # Create a DatasetRef for ingest
784 refIn = DatasetRef(datasetType, dataId, id=None)
786 datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))
788 butler.ingest(*datasets, transfer="copy")
790 dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
791 dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}
793 metrics1 = butler.get(datasetTypeName, dataId1)
794 metrics2 = butler.get(datasetTypeName, dataId2)
795 self.assertNotEqual(metrics1, metrics2)
797 # Compare URIs
798 uri1 = butler.getURI(datasetTypeName, dataId1)
799 uri2 = butler.getURI(datasetTypeName, dataId2)
800 self.assertNotEqual(uri1, uri2)
802 # Now do a multi-dataset but single file ingest
803 metricFile = os.path.join(dataRoot, "detectors.yaml")
804 refs = []
805 for detector in (1, 2):
806 detector_name = f"detector_{detector}"
807 dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
808 # Create a DatasetRef for ingest
809 refs.append(DatasetRef(datasetType, dataId, id=None))
811 # Test "move" transfer to ensure that the files themselves
812 # have disappeared following ingest.
813 with ResourcePath.temporary_uri(suffix=".yaml") as tempFile:
814 tempFile.transfer_from(ResourcePath(metricFile), transfer="copy")
816 datasets = []
817 datasets.append(FileDataset(path=tempFile, refs=refs, formatter=MultiDetectorFormatter))
819 butler.ingest(*datasets, transfer="move", record_validation_info=False)
820 self.assertFalse(tempFile.exists())
822 # Check that the datastore recorded no file size.
823 # Not all datastores can support this.
824 try:
825 infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])
826 self.assertEqual(infos[0].file_size, -1)
827 except AttributeError:
828 pass
830 dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
831 dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}
833 multi1 = butler.get(datasetTypeName, dataId1)
834 multi2 = butler.get(datasetTypeName, dataId2)
836 self.assertEqual(multi1, metrics1)
837 self.assertEqual(multi2, metrics2)
839 # Compare URIs
840 uri1 = butler.getURI(datasetTypeName, dataId1)
841 uri2 = butler.getURI(datasetTypeName, dataId2)
842 self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")
844 # Test that removing one does not break the second.
845 # This line will issue a warning log message for a ChainedDatastore
846 # that uses an InMemoryDatastore since in-memory cannot ingest
847 # files.
848 butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
849 self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
850 self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
851 multi2b = butler.get(datasetTypeName, dataId2)
852 self.assertEqual(multi2, multi2b)
854 def testPickle(self):
855 """Test pickle support."""
856 butler = Butler(self.tmpConfigFile, run=self.default_run)
857 butlerOut = pickle.loads(pickle.dumps(butler))
858 self.assertIsInstance(butlerOut, Butler)
859 self.assertEqual(butlerOut._config, butler._config)
860 self.assertEqual(butlerOut.collections, butler.collections)
861 self.assertEqual(butlerOut.run, butler.run)
863 def testGetDatasetTypes(self):
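        """Test querying registered dataset types (including their components)
        and validating the butler configuration against them."""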
864 butler = Butler(self.tmpConfigFile, run=self.default_run)
865 dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
866 dimensionEntries = [
867 (
868 "instrument",
869 {"instrument": "DummyCam"},
870 {"instrument": "DummyHSC"},
871 {"instrument": "DummyCamComp"},
872 ),
873 ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
874 ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
875 ]
876 storageClass = self.storageClassFactory.getStorageClass("StructuredData")
877 # Add needed Dimensions
878 for args in dimensionEntries:
879 butler.registry.insertDimensionData(*args)
881 # When a DatasetType is added to the registry, entries are not created
882 # for its components, but queries can still return the components.
883 datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
884 components = set()
885 for datasetTypeName in datasetTypeNames:
886 # Create and register a DatasetType
887 self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
889 for componentName in storageClass.components:
890 components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))
892 fromRegistry: set[DatasetType] = set()
893 for parent_dataset_type in butler.registry.queryDatasetTypes():
894 fromRegistry.add(parent_dataset_type)
895 fromRegistry.update(parent_dataset_type.makeAllComponentDatasetTypes())
896 self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)
898 # Now that we have some dataset types registered, validate them
899 butler.validateConfiguration(
900 ignore=[
901 "test_metric_comp",
902 "metric3",
903 "metric5",
904 "calexp",
905 "DummySC",
906 "datasetType.component",
907 "random_data",
908 "random_data_2",
909 ]
910 )
912 # Add a new datasetType that will fail template validation
913 self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
914 if self.validationCanFail:
915 with self.assertRaises(ValidationError):
916 butler.validateConfiguration()
918 # Rerun validation but with a subset of dataset type names
919 butler.validateConfiguration(datasetTypeNames=["metric4"])
921 # Rerun validation but ignore the bad datasetType
922 butler.validateConfiguration(
923 ignore=[
924 "test_metric_comp",
925 "metric3",
926 "metric5",
927 "calexp",
928 "DummySC",
929 "datasetType.component",
930 "random_data",
931 "random_data_2",
932 ]
933 )
935 def testTransaction(self):
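        """Test that raising inside a butler transaction rolls back the
        dimension inserts and the put."""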
936 butler = Butler(self.tmpConfigFile, run=self.default_run)
937 datasetTypeName = "test_metric"
938 dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
939 dimensionEntries = (
940 ("instrument", {"instrument": "DummyCam"}),
941 ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
942 ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
943 )
944 storageClass = self.storageClassFactory.getStorageClass("StructuredData")
945 metric = makeExampleMetrics()
946 dataId = {"instrument": "DummyCam", "visit": 42}
947 # Create and register a DatasetType
948 datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
949 with self.assertRaises(TransactionTestError):
950 with butler.transaction():
951 # Add needed Dimensions
952 for args in dimensionEntries:
953 butler.registry.insertDimensionData(*args)
954 # Store a dataset
955 ref = butler.put(metric, datasetTypeName, dataId)
956 self.assertIsInstance(ref, DatasetRef)
957 # Test direct retrieval using the ref
958 metricOut = butler.get(ref)
959 self.assertEqual(metric, metricOut)
960 # Test get
961 metricOut = butler.get(datasetTypeName, dataId)
962 self.assertEqual(metric, metricOut)
963 # Check we can get components
964 self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
965 raise TransactionTestError("This should roll back the entire transaction")
966 with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
967 butler.registry.expandDataId(dataId)
968 # Should raise LookupError for missing data ID value
969 with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
970 butler.get(datasetTypeName, dataId)
971 # Also check explicitly if Dataset entry is missing
972 self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
973 # Direct retrieval should not find the file in the Datastore
974 with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
975 butler.get(ref)
977 def testMakeRepo(self):
978 """Test that we can write butler configuration to a new repository via
979 the Butler.makeRepo interface and then instantiate a butler from the
980 repo root.
981 """
982 # Do not run the test if we know this datastore configuration does
983 # not support a file system root
984 if self.fullConfigKey is None:
985 return
987 # create two separate directories
988 root1 = tempfile.mkdtemp(dir=self.root)
989 root2 = tempfile.mkdtemp(dir=self.root)
991 butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
992 limited = Config(self.configFile)
993 butler1 = Butler(butlerConfig)
994 butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
995 full = Config(self.tmpConfigFile)
996 butler2 = Butler(butlerConfig)
997 # Butlers should have the same configuration regardless of whether
998 # defaults were expanded.
999 self.assertEqual(butler1._config, butler2._config)
1000 # Config files loaded directly should not be the same.
1001 self.assertNotEqual(limited, full)
1002 # Make sure "limited" doesn't have a few keys we know it should be
1003 # inheriting from defaults.
1004 self.assertIn(self.fullConfigKey, full)
1005 self.assertNotIn(self.fullConfigKey, limited)
1007 # Collections don't appear until something is put in them
1008 collections1 = set(butler1.registry.queryCollections())
1009 self.assertEqual(collections1, set())
1010 self.assertEqual(set(butler2.registry.queryCollections()), collections1)
1012 # Check that a config with no associated file name will not
1013 # work properly with relocatable Butler repo
1014 butlerConfig.configFile = None
1015 with self.assertRaises(ValueError):
1016 Butler(butlerConfig)
1018 with self.assertRaises(FileExistsError):
1019 Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)
1021 def testStringification(self):
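        """Test that the butler string representation includes the expected
        datastore and registry information."""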
1022 butler = Butler(self.tmpConfigFile, run=self.default_run)
1023 butlerStr = str(butler)
1025 if self.datastoreStr is not None:
1026 for testStr in self.datastoreStr:
1027 self.assertIn(testStr, butlerStr)
1028 if self.registryStr is not None:
1029 self.assertIn(self.registryStr, butlerStr)
1031 datastoreName = butler.datastore.name
1032 if self.datastoreName is not None:
1033 for testStr in self.datastoreName:
1034 self.assertIn(testStr, datastoreName)
1036 def testButlerRewriteDataId(self):
1037 """Test that dataIds can be rewritten based on dimension records."""
1039 butler = Butler(self.tmpConfigFile, run=self.default_run)
1041 storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
1042 datasetTypeName = "random_data"
1044 # Create dimension records.
1045 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1046 butler.registry.insertDimensionData(
1047 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
1048 )
1049 butler.registry.insertDimensionData(
1050 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
1051 )
1053 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
1054 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
1055 butler.registry.registerDatasetType(datasetType)
1057 n_exposures = 5
1058 dayobs = 20210530
1060 for i in range(n_exposures):
1061 butler.registry.insertDimensionData(
1062 "exposure",
1063 {
1064 "instrument": "DummyCamComp",
1065 "id": i,
1066 "obs_id": f"exp{i}",
1067 "seq_num": i,
1068 "day_obs": dayobs,
1069 "physical_filter": "d-r",
1070 },
1071 )
1073 # Write some data.
1074 for i in range(n_exposures):
1075 metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}
1077 # Use the seq_num for the put to test rewriting.
1078 dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
1079 ref = butler.put(metric, datasetTypeName, dataId=dataId)
1081 # Check that the exposure is correct in the dataId
1082 self.assertEqual(ref.dataId["exposure"], i)
1084 # and check that we can get the dataset back with the same dataId
1085 new_metric = butler.get(datasetTypeName, dataId=dataId)
1086 self.assertEqual(new_metric, metric)
1089class FileDatastoreButlerTests(ButlerTests):
1090 """Common tests and specialization of ButlerTests for butlers backed
1091 by datastores that inherit from FileDatastore.
1092 """
1094 def checkFileExists(self, root, relpath):
1095 """Checks if file exists at a given path (relative to root).
1097 Test testPutTemplates verifies actual physical existance of the files
1098 in the requested location.
1099 """
1100 uri = ResourcePath(root, forceDirectory=True)
1101 return uri.join(relpath).exists()
1103 def testPutTemplates(self):
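        """Test that file templates control the paths written by put and that
        invalid or non-unique templates raise errors."""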
1104 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1105 butler = Butler(self.tmpConfigFile, run=self.default_run)
1107 # Add needed Dimensions
1108 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1109 butler.registry.insertDimensionData(
1110 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
1111 )
1112 butler.registry.insertDimensionData(
1113 "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
1114 )
1115 butler.registry.insertDimensionData(
1116 "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
1117 )
1119 # Create and store a dataset
1120 metric = makeExampleMetrics()
1122 # Create two almost-identical DatasetTypes (both will use default
1123 # template)
1124 dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
1125 butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
1126 butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
1127 butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))
1129 dataId1 = {"instrument": "DummyCamComp", "visit": 423}
1130 dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}
1132 # Put with exactly the data ID keys needed
1133 ref = butler.put(metric, "metric1", dataId1)
1134 uri = butler.getURI(ref)
1135 self.assertTrue(uri.exists())
1136 self.assertTrue(
1137 uri.unquoted_path.endswith(f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle")
1138 )
1140 # Check the template based on dimensions
1141 if hasattr(butler.datastore, "templates"):
1142 butler.datastore.templates.validateTemplates([ref])
1144 # Put with extra data ID keys (physical_filter is an optional
1145 # dependency); should not change template (at least the way we're
1146 # defining them to behave now; the important thing is that they
1147 # must be consistent).
1148 ref = butler.put(metric, "metric2", dataId2)
1149 uri = butler.getURI(ref)
1150 self.assertTrue(uri.exists())
1151 self.assertTrue(
1152 uri.unquoted_path.endswith(f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle")
1153 )
1155 # Check the template based on dimensions
1156 if hasattr(butler.datastore, "templates"):
1157 butler.datastore.templates.validateTemplates([ref])
1159 # Use a template that has a typo in dimension record metadata.
1160 # Easier to test with a butler that has a ref with records attached.
1161 template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits")
1162 with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"):
1163 path = template.format(ref)
1164 self.assertEqual(path, f"a/v423/{ref.id}_fits")
1166 template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits")
1167 with self.assertRaises(KeyError):
1168 with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"):
1169 template.format(ref)
1171 # Now use a file template that will not result in unique filenames
1172 with self.assertRaises(FileTemplateValidationError):
1173 butler.put(metric, "metric3", dataId1)
1175 def testImportExport(self):
1176 # Run put/get tests just to create and populate a repo.
1177 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1178 self.runImportExportTest(storageClass)
1180 @unittest.expectedFailure
1181 def testImportExportVirtualComposite(self):
1182 # Run put/get tests just to create and populate a repo.
1183 storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
1184 self.runImportExportTest(storageClass)
1186 def runImportExportTest(self, storageClass):
1187 """This test does an export to a temp directory and an import back
1188 into a new temp directory repo. It does not assume a POSIX datastore."""
1189 exportButler = self.runPutGetTest(storageClass, "test_metric")
1191 # Test that we must have a file extension.
1192 with self.assertRaises(ValueError):
1193 with exportButler.export(filename="dump", directory=".") as export:
1194 pass
1196 # Test that unknown format is not allowed.
1197 with self.assertRaises(ValueError):
1198 with exportButler.export(filename="dump.fits", directory=".") as export:
1199 pass
1201 # Test that the repo actually has at least one dataset.
1202 datasets = list(exportButler.registry.queryDatasets(..., collections=...))
1203 self.assertGreater(len(datasets), 0)
1204 # Add a DimensionRecord that's unused by those datasets.
1205 skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
1206 exportButler.registry.insertDimensionData("skymap", skymapRecord)
1207 # Export and then import datasets.
1208 with safeTestTempDir(TESTDIR) as exportDir:
1209 exportFile = os.path.join(exportDir, "exports.yaml")
1210 with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
1211 export.saveDatasets(datasets)
1212 # Export the same datasets again. This should quietly do
1213 # nothing because of internal deduplication, and it shouldn't
1214 # complain about being asked to export the "htm7" elements even
1215 # though there aren't any in these datasets or in the database.
1216 export.saveDatasets(datasets, elements=["htm7"])
1217 # Save one of the data IDs again; this should be harmless
1218 # because of internal deduplication.
1219 export.saveDataIds([datasets[0].dataId])
1220 # Save some dimension records directly.
1221 export.saveDimensionData("skymap", [skymapRecord])
1222 self.assertTrue(os.path.exists(exportFile))
1223 with safeTestTempDir(TESTDIR) as importDir:
1224 # We always want this to be a local posix butler
1225 Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
1226 # Calling script.butlerImport tests the implementation of the
1227 # butler command line interface "import" subcommand. Functions
1228 # in the script folder are generally considered protected and
1229 # should not be used as public api.
1230 with open(exportFile, "r") as f:
1231 script.butlerImport(
1232 importDir,
1233 export_file=f,
1234 directory=exportDir,
1235 transfer="auto",
1236 skip_dimensions=None,
1237 reuse_ids=False,
1238 )
1239 importButler = Butler(importDir, run=self.default_run)
1240 for ref in datasets:
1241 with self.subTest(ref=ref):
1242 # Test for existence by passing in the DatasetType and
1243 # data ID separately, to avoid lookup by dataset_id.
1244 self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
1245 self.assertEqual(
1246 list(importButler.registry.queryDimensionRecords("skymap")),
1247 [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
1248 )
1250 def testRemoveRuns(self):
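        """Test removing RUN collections with and without unstoring their
        datasets, and removing the dataset type afterwards."""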
1251 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1252 butler = Butler(self.tmpConfigFile, writeable=True)
1253 # Load registry data with dimensions to hang datasets off of.
1254 registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
1255 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1256 # Add some RUN-type collections.
1257 run1 = "run1"
1258 butler.registry.registerRun(run1)
1259 run2 = "run2"
1260 butler.registry.registerRun(run2)
1261 # put a dataset in each
1262 metric = makeExampleMetrics()
1263 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1264 datasetType = self.addDatasetType(
1265 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1266 )
1267 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1268 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1269 uri1 = butler.getURI(ref1, collections=[run1])
1270 uri2 = butler.getURI(ref2, collections=[run2])
1272 with self.assertRaises(OrphanedRecordError):
1273 butler.registry.removeDatasetType(datasetType.name)
1275 # Remove from both runs with different values for unstore.
1276 butler.removeRuns([run1], unstore=True)
1277 butler.removeRuns([run2], unstore=False)
1278 # Should be nothing in registry for either one, and datastore should
1279 # not think either exists.
1280 with self.assertRaises(MissingCollectionError):
1281 butler.registry.getCollectionType(run1)
1282 with self.assertRaises(MissingCollectionError):
1283 butler.registry.getCollectionType(run2)
1284 self.assertFalse(butler.datastore.exists(ref1))
1285 self.assertFalse(butler.datastore.exists(ref2))
1286 # The ref we unstored should be gone according to the URI, but the
1287 # one we forgot should still be around.
1288 self.assertFalse(uri1.exists())
1289 self.assertTrue(uri2.exists())
1291 # Now that the collections have been pruned we can remove the
1292 # dataset type
1293 butler.registry.removeDatasetType(datasetType.name)
1295 with self.assertLogs("lsst.daf.butler.registries", "INFO") as cm:
1296 butler.registry.removeDatasetType(tuple(["test*", "test*"]))
1297 self.assertIn("not defined", "\n".join(cm.output))
1300class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1301 """PosixDatastore specialization of a butler"""
1303 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1304 fullConfigKey = ".datastore.formatters"
1305 validationCanFail = True
1306 datastoreStr = ["/tmp"]
1307 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1308 registryStr = "/gen3.sqlite3"
1310 def testPathConstructor(self):
1311 """Independent test of constructor using PathLike."""
1312 butler = Butler(self.tmpConfigFile, run=self.default_run)
1313 self.assertIsInstance(butler, Butler)
1315 # And again with a Path object with the butler yaml
1316 path = pathlib.Path(self.tmpConfigFile)
1317 butler = Butler(path, writeable=False)
1318 self.assertIsInstance(butler, Butler)
1320 # And again with a Path object without the butler yaml
1321 # (making sure we skip it if the tmp config doesn't end
1322 # in butler.yaml -- which is the case for a subclass)
1323 if self.tmpConfigFile.endswith("butler.yaml"):
1324 path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
1325 butler = Butler(path, writeable=False)
1326 self.assertIsInstance(butler, Butler)
1328 def testExportTransferCopy(self):
1329 """Test local export using all transfer modes"""
1330 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1331 exportButler = self.runPutGetTest(storageClass, "test_metric")
1332 # Test that the repo actually has at least one dataset.
1333 datasets = list(exportButler.registry.queryDatasets(..., collections=...))
1334 self.assertGreater(len(datasets), 0)
1335 uris = [exportButler.getURI(d) for d in datasets]
1336 datastoreRoot = exportButler.datastore.root
1338 pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]
1340 for path in pathsInStore:
1341 # Assume local file system
1342 self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")
1344 for transfer in ("copy", "link", "symlink", "relsymlink"):
1345 with safeTestTempDir(TESTDIR) as exportDir:
1346 with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
1347 export.saveDatasets(datasets)
1348 for path in pathsInStore:
1349 self.assertTrue(
1350 self.checkFileExists(exportDir, path),
1351 f"Check that mode {transfer} exported files",
1352 )
1354 def testPruneDatasets(self):
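        """Test pruning datasets, including datastore trash and emptyTrash
        behavior when datastore records are removed while files remain on disk."""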
1355 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1356 butler = Butler(self.tmpConfigFile, writeable=True)
1357 # Load registry data with dimensions to hang datasets off of.
1358 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1359 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1360 # Add some RUN-type collections.
1361 run1 = "run1"
1362 butler.registry.registerRun(run1)
1363 run2 = "run2"
1364 butler.registry.registerRun(run2)
1365 # put some datasets. ref1 and ref2 have the same data ID, and are in
1366 # different runs. ref3 has a different data ID.
1367 metric = makeExampleMetrics()
1368 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1369 datasetType = self.addDatasetType(
1370 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1371 )
1372 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1373 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1374 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1376 # Simple prune.
1377 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1378 with self.assertRaises(LookupError):
1379 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)
1381 # Put data back.
1382 ref1 = butler.put(metric, ref1.unresolved(), run=run1)
1383 ref2 = butler.put(metric, ref2.unresolved(), run=run2)
1384 ref3 = butler.put(metric, ref3.unresolved(), run=run1)
1386 # Check that in normal mode, deleting the record will lead to
1387 # trash not touching the file.
1388 uri1 = butler.datastore.getURI(ref1)
1389 butler.datastore.bridge.moveToTrash([ref1], transaction=None) # Update the dataset_location table
1390 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
1391 butler.datastore.trash(ref1)
1392 butler.datastore.emptyTrash()
1393 self.assertTrue(uri1.exists())
1394 uri1.remove() # Clean it up.
1396 # Simulate execution butler setup by deleting the datastore
1397 # record but keeping the file around and trusting.
1398 butler.datastore.trustGetRequest = True
1399 uri2 = butler.datastore.getURI(ref2)
1400 uri3 = butler.datastore.getURI(ref3)
1401 self.assertTrue(uri2.exists())
1402 self.assertTrue(uri3.exists())
1404 # Remove the datastore record.
1405 butler.datastore.bridge.moveToTrash([ref2], transaction=None) # Update the dataset_location table
1406 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
1407 self.assertTrue(uri2.exists())
1408 butler.datastore.trash([ref2, ref3])
1409 # The ref2 file is removed immediately because its record is missing.
1410 self.assertFalse(uri2.exists())
1411 # But ref3 has to wait until the trash is emptied.
1412 self.assertTrue(uri3.exists())
1413 butler.datastore.emptyTrash()
1414 self.assertFalse(uri3.exists())
1416 # Clear out the datasets from registry.
1417 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
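    # Editor's illustrative sketch: the two-step removal cycle the assertions
    # above rely on. ``trash`` only marks a dataset's artifacts for deletion;
    # the file itself is removed when the trash is emptied (assuming the
    # datastore record is still present, unlike the trust-mode cases above).
    def _example_trash_cycle(self, butler, ref):
        uri = butler.datastore.getURI(ref)
        butler.datastore.trash(ref)       # marked for deletion only
        assert uri.exists()
        butler.datastore.emptyTrash()     # artifact deleted here
        assert not uri.exists()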
1419 def testPytypeCoercion(self):
1420 """Test python type coercion on Butler.get and put."""
1422 # Store some data with the normal example storage class.
1423 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1424 datasetTypeName = "test_metric"
1425 butler = self.runPutGetTest(storageClass, datasetTypeName)
1427 dataId = {"instrument": "DummyCamComp", "visit": 423}
1428 metric = butler.get(datasetTypeName, dataId=dataId)
1429 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1431 datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
1432 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1434 # Now need to hack the registry dataset type definition.
1435 # There is no API for this.
1436 manager = butler.registry._managers.datasets
1437 manager._db.update(
1438 manager._static.dataset_type,
1439 {"name": datasetTypeName},
1440 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1441 )
1443 # Force reset of dataset type cache
1444 butler.registry.refresh()
1446 datasetType_new = butler.registry.getDatasetType(datasetTypeName)
1447 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1448 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1450 metric_model = butler.get(datasetTypeName, dataId=dataId)
1451 self.assertNotEqual(type(metric_model), type(metric))
1452 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1454 # Put the model and read it back to show that everything now
1455 # works as normal.
1456 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1457 metric_model_new = butler.get(metric_ref)
1458 self.assertEqual(metric_model_new, metric_model)
1460 # Hack the storage class again to something that will fail on get
1461 # because no compatible conversion exists.
1462 manager._db.update(
1463 manager._static.dataset_type,
1464 {"name": datasetTypeName},
1465 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1466 )
1467 butler.registry.refresh()
1469 with self.assertRaises(ValueError):
1470 butler.get(datasetTypeName, dataId=dataId)
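    # Editor's illustrative sketch: the storage class registered for a dataset
    # type determines the Python type that Butler.get returns, which is what
    # the coercion checks above verify.
    def _example_registered_python_type(self, butler, datasetTypeName, dataId):
        datasetType = butler.registry.getDatasetType(datasetTypeName)
        obj = butler.get(datasetTypeName, dataId=dataId)
        return datasetType.storageClass.name, get_full_type_name(obj)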
1473@unittest.skipUnless(testing is not None, "testing.postgresql module not found")
1474class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1475 """PosixDatastore specialization of a butler using Postgres"""
1477 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1478 fullConfigKey = ".datastore.formatters"
1479 validationCanFail = True
1480 datastoreStr = ["/tmp"]
1481 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1482 registryStr = "PostgreSQL@test"
1484 @staticmethod
1485 def _handler(postgresql):
1486 engine = sqlalchemy.engine.create_engine(postgresql.url())
1487 with engine.begin() as connection:
1488 connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;"))
1490 @classmethod
1491 def setUpClass(cls):
1492 # Create the postgres test server.
1493 cls.postgresql = testing.postgresql.PostgresqlFactory(
1494 cache_initialized_db=True, on_initialized=cls._handler
1495 )
1496 super().setUpClass()
1498 @classmethod
1499 def tearDownClass(cls):
1500 # Clean up any lingering SQLAlchemy engines/connections
1501 # so they're closed before we shut down the server.
1502 gc.collect()
1503 cls.postgresql.clear_cache()
1504 super().tearDownClass()
1506 def setUp(self):
1507 self.server = self.postgresql()
1509 # Need to add a registry section to the config.
1510 self._temp_config = False
1511 config = Config(self.configFile)
1512 config["registry", "db"] = self.server.url()
1513 with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh:
1514 config.dump(fh)
1515 self.configFile = fh.name
1516 self._temp_config = True
1517 super().setUp()
1519 def tearDown(self):
1520 self.server.stop()
1521 if self._temp_config and os.path.exists(self.configFile):
1522 os.remove(self.configFile)
1523 super().tearDown()
1525 def testMakeRepo(self):
1526 # The base class test assumes that it is using SQLite and that the
1527 # config file is acceptable to SQLite.
1528 raise unittest.SkipTest("Postgres config is not compatible with this test.")
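# Editor's illustrative sketch (not part of the original test suite): pointing
# a butler configuration at an external PostgreSQL registry, as setUp does
# above with the testing.postgresql server. The connection URL shown here is
# a placeholder.
def _example_postgres_config(base_config_file, db_url="postgresql://user@host/dbname"):
    config = Config(base_config_file)
    config["registry", "db"] = db_url
    return config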
1531class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1532 """InMemoryDatastore specialization of a butler"""
1534 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1535 fullConfigKey = None
1536 useTempRoot = False
1537 validationCanFail = False
1538 datastoreStr = ["datastore='InMemory"]
1539 datastoreName = ["InMemoryDatastore@"]
1540 registryStr = "/gen3.sqlite3"
1542 def testIngest(self):
1543 pass
1546class ChainedDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1547 """PosixDatastore specialization"""
1549 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
1550 fullConfigKey = ".datastore.datastores.1.formatters"
1551 validationCanFail = True
1552 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
1553 datastoreName = [
1554 "InMemoryDatastore@",
1555 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
1556 "SecondDatastore",
1557 ]
1558 registryStr = "/gen3.sqlite3"
1561class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
1562 """Test that a yaml file in one location can refer to a root in another."""
1564 datastoreStr = ["dir1"]
1565 # Disable the makeRepo test since we are deliberately not using
1566 # butler.yaml as the config name.
1567 fullConfigKey = None
1569 def setUp(self):
1570 self.root = makeTestTempDir(TESTDIR)
1572 # Make a new repository in one place
1573 self.dir1 = os.path.join(self.root, "dir1")
1574 Butler.makeRepo(self.dir1, config=Config(self.configFile))
1576 # Move the yaml file to a different place and add a "root"
1577 self.dir2 = os.path.join(self.root, "dir2")
1578 os.makedirs(self.dir2, exist_ok=True)
1579 configFile1 = os.path.join(self.dir1, "butler.yaml")
1580 config = Config(configFile1)
1581 config["root"] = self.dir1
1582 configFile2 = os.path.join(self.dir2, "butler2.yaml")
1583 config.dumpToUri(configFile2)
1584 os.remove(configFile1)
1585 self.tmpConfigFile = configFile2
1587 def testFileLocations(self):
1588 self.assertNotEqual(self.dir1, self.dir2)
1589 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
1590 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
1591 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
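# Editor's illustrative sketch: relocating a repository's YAML configuration
# while leaving the datastore and registry in place, as the test case above
# sets up. ``repo_dir`` and ``config_dir`` are hypothetical paths.
def _example_relocate_config(repo_dir, config_dir):
    config = Config(os.path.join(repo_dir, "butler.yaml"))
    config["root"] = repo_dir
    relocated = os.path.join(config_dir, "butler2.yaml")
    config.dumpToUri(relocated)
    return Butler(relocated)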
1594class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
1595 """Test that a config file created by makeRepo outside of repo works."""
1597 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1599 def setUp(self):
1600 self.root = makeTestTempDir(TESTDIR)
1601 self.root2 = makeTestTempDir(TESTDIR)
1603 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
1604 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1606 def tearDown(self):
1607 if os.path.exists(self.root2):
1608 shutil.rmtree(self.root2, ignore_errors=True)
1609 super().tearDown()
1611 def testConfigExistence(self):
1612 c = Config(self.tmpConfigFile)
1613 uri_config = ResourcePath(c["root"])
1614 uri_expected = ResourcePath(self.root, forceDirectory=True)
1615 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
1616 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
1618 def testPutGet(self):
1619 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1620 self.runPutGetTest(storageClass, "test_metric")
1623class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
1624 """Test that a config file created by makeRepo outside of repo works."""
1626 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1628 def setUp(self):
1629 self.root = makeTestTempDir(TESTDIR)
1630 self.root2 = makeTestTempDir(TESTDIR)
1632 self.tmpConfigFile = self.root2
1633 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1635 def testConfigExistence(self):
1636 # Append the yaml file name, otherwise the Config constructor does
1637 # not know the file type.
1638 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
1639 super().testConfigExistence()
1642class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
1643 """Test that a config file created by makeRepo outside of repo works."""
1645 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1647 def setUp(self):
1648 self.root = makeTestTempDir(TESTDIR)
1649 self.root2 = makeTestTempDir(TESTDIR)
1651 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
1652 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1655@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
1656class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1657 """S3Datastore specialization of a butler; an S3 storage Datastore +
1658 a local in-memory SqlRegistry.
1659 """
1661 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
1662 fullConfigKey = None
1663 validationCanFail = True
1665 bucketName = "anybucketname"
1666 """Name of the Bucket that will be used in the tests. The name is read from
1667 the config file used with the tests during set-up.
1668 """
1670 root = "butlerRoot/"
1671 """Root repository directory expected to be used in case useTempRoot=False.
1672 Otherwise the root is set to a 20 characters long randomly generated string
1673 during set-up.
1674 """
1676 datastoreStr = [f"datastore={root}"]
1677 """Contains all expected root locations in a format expected to be
1678 returned by Butler stringification.
1679 """
1681 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
1682 """The expected format of the S3 Datastore string."""
1684 registryStr = "/gen3.sqlite3"
1685 """Expected format of the Registry string."""
1687 mock_s3 = mock_s3()
1688 """The mocked s3 interface from moto."""
1690 def genRoot(self):
1691 """Returns a random string of len 20 to serve as a root
1692 name for the temporary bucket repo.
1694 This is equivalent to tempfile.mkdtemp as this is what self.root
1695 becomes when useTempRoot is True.
1696 """
1697 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1698 return rndstr + "/"
1700 def setUp(self):
1701 config = Config(self.configFile)
1702 uri = ResourcePath(config[".datastore.datastore.root"])
1703 self.bucketName = uri.netloc
1705 # Enable S3 mocking of tests.
1706 self.mock_s3.start()
1708 # Set up some fake credentials if they do not exist.
1709 self.usingDummyCredentials = setAwsEnvCredentials()
1711 if self.useTempRoot:
1712 self.root = self.genRoot()
1713 rooturi = f"s3://{self.bucketName}/{self.root}"
1714 config.update({"datastore": {"datastore": {"root": rooturi}}})
1716 # Need a local folder to store the registry database.
1717 self.reg_dir = makeTestTempDir(TESTDIR)
1718 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1720 # Moto needs to know that we expect the bucket to exist
1721 # (the name used to be the class attribute bucketName).
1722 s3 = boto3.resource("s3")
1723 s3.create_bucket(Bucket=self.bucketName)
1725 self.datastoreStr = f"datastore={self.root}"
1726 self.datastoreName = [f"FileDatastore@{rooturi}"]
1727 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
1728 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
1730 def tearDown(self):
1731 s3 = boto3.resource("s3")
1732 bucket = s3.Bucket(self.bucketName)
1733 try:
1734 bucket.objects.all().delete()
1735 except botocore.exceptions.ClientError as e:
1736 if e.response["Error"]["Code"] == "404":
1737 # the key was not reachable - pass
1738 pass
1739 else:
1740 raise
1742 bucket = s3.Bucket(self.bucketName)
1743 bucket.delete()
1745 # Stop the S3 mock.
1746 self.mock_s3.stop()
1748 # unset any potentially set dummy credentials
1749 if self.usingDummyCredentials:
1750 unsetAwsEnvCredentials()
1752 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1753 shutil.rmtree(self.reg_dir, ignore_errors=True)
1755 if self.useTempRoot and os.path.exists(self.root):
1756 shutil.rmtree(self.root, ignore_errors=True)
1758 super().tearDown()
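# Editor's illustrative sketch: creating a throw-away butler repository in a
# moto-mocked S3 bucket, following the same steps as setUp above. The bucket
# and root names are placeholders, and teardown (mock.stop(), credential
# clean-up, bucket removal) is left to the caller.
def _example_s3_repo(config_file, bucket_name="example-bucket", root="exampleRoot/"):
    mock = mock_s3()
    mock.start()                              # boto3 calls below hit the mock
    dummy_credentials = setAwsEnvCredentials()
    boto3.resource("s3").create_bucket(Bucket=bucket_name)
    root_uri = f"s3://{bucket_name}/{root}"
    config = Config(config_file)
    config.update({"datastore": {"datastore": {"root": root_uri}}})
    # SQLite cannot live in S3, so keep the registry database on local disk.
    reg_dir = tempfile.mkdtemp()
    config["registry", "db"] = f"sqlite:///{reg_dir}/gen3.sqlite3"
    Butler.makeRepo(root_uri, config=config, forceConfigRoot=False)
    return posixpath.join(root_uri, "butler.yaml"), mock, dummy_credentials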
1761class PosixDatastoreTransfers(unittest.TestCase):
1762 """Test data transfers between butlers.
1764 Tests cover different dataset-id managers. UUID to UUID and integer
1765 to integer are tested. UUID to integer is not supported since we do
1766 not currently want to allow that. Integer to UUID is supported, with
1767 the caveat that a UUID4 will be generated, which would be incorrect
1768 for raw dataset types; the tests ignore that.
1769 """
1771 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1773 @classmethod
1774 def setUpClass(cls):
1775 cls.storageClassFactory = StorageClassFactory()
1776 cls.storageClassFactory.addFromConfig(cls.configFile)
1778 def setUp(self):
1779 self.root = makeTestTempDir(TESTDIR)
1780 self.config = Config(self.configFile)
1782 def tearDown(self):
1783 removeTestTempDir(self.root)
1785 def create_butler(self, manager, label):
1786 config = Config(self.configFile)
1787 config["registry", "managers", "datasets"] = manager
1788 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
1790 def create_butlers(self, manager1=None, manager2=None):
1791 default = "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID"
1792 if manager1 is None:
1793 manager1 = default
1794 if manager2 is None:
1795 manager2 = default
1796 self.source_butler = self.create_butler(manager1, "1")
1797 self.target_butler = self.create_butler(manager2, "2")
1799 def testTransferUuidToUuid(self):
1800 self.create_butlers()
1801 # Setting id_gen_map should have no effect here
1802 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1804 def _enable_trust(self, datastore) -> None:
1805 if hasattr(datastore, "trustGetRequest"):
1806 datastore.trustGetRequest = True
1807 elif hasattr(datastore, "datastores"):
1808 for datastore in datastore.datastores:
1809 if hasattr(datastore, "trustGetRequest"):
1810 datastore.trustGetRequest = True
1812 def testTransferMissing(self):
1813 """Test transfers where datastore records are missing.
1815 This is how execution butler works.
1816 """
1817 self.create_butlers()
1819 # Configure the source butler to allow trust.
1820 self._enable_trust(self.source_butler.datastore)
1822 self.assertButlerTransfers(purge=True)
1824 def testTransferMissingDisassembly(self):
1825 """Test transfers where datastore records are missing.
1827 This is how execution butler works.
1828 """
1829 self.create_butlers()
1831 # Configure the source butler to allow trust.
1832 self._enable_trust(self.source_butler.datastore)
1834 # Test disassembly.
1835 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
1837 def testAbsoluteURITransferDirect(self):
1838 """Test transfer using an absolute URI."""
1839 self._absolute_transfer("auto")
1841 def testAbsoluteURITransferCopy(self):
1842 """Test transfer using an absolute URI."""
1843 self._absolute_transfer("copy")
1845 def _absolute_transfer(self, transfer):
1846 self.create_butlers()
1848 storageClassName = "StructuredData"
1849 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
1850 datasetTypeName = "random_data"
1851 runs = ["run1", "run2"]
1852 for run in runs:
1853 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
1855 dimensions = self.source_butler.registry.dimensions.extract(())
1856 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
1857 self.source_butler.registry.registerDatasetType(datasetType)
1859 metrics = makeExampleMetrics()
1860 with ResourcePath.temporary_uri(suffix=".json") as temp:
1861 source_refs = [DatasetRef(datasetType, {})]
1862 temp.write(json.dumps(metrics.exportAsDict()).encode())
1863 dataset = FileDataset(path=temp, refs=source_refs)
1864 self.source_butler.ingest(dataset, transfer="direct", run="run1")
1866 self.target_butler.transfer_from(
1867 self.source_butler, dataset.refs, register_dataset_types=True, transfer=transfer
1868 )
1870 uri = self.target_butler.getURI(dataset.refs[0])
1871 if transfer == "auto":
1872 self.assertEqual(uri, temp)
1873 else:
1874 self.assertNotEqual(uri, temp)
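    # Editor's illustrative sketch: with transfer="direct" the ingested file is
    # left in place and the datastore records its absolute URI, which is why
    # the "auto" transfer above keeps pointing at the temporary file.
    # ``file_uri`` is a hypothetical absolute location.
    def _example_direct_ingest(self, butler, dataset_type, file_uri, run):
        dataset = FileDataset(path=file_uri, refs=[DatasetRef(dataset_type, {})])
        butler.ingest(dataset, transfer="direct", run=run)
        # After ingest the refs are resolved; the recorded URI is the original
        # absolute location rather than a path inside the datastore root.
        return butler.getURI(dataset.refs[0])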
1876 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"):
1877 """Test that a run can be transferred to another butler."""
1879 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
1880 datasetTypeName = "random_data"
1882 # Test will create 3 collections and we will want to transfer
1883 # two of those three.
1884 runs = ["run1", "run2", "other"]
1886 # Also want to use two different dataset types to ensure that
1887 # grouping works.
1888 datasetTypeNames = ["random_data", "random_data_2"]
1890 # Create the run collections in the source butler.
1891 for run in runs:
1892 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
1894 # Create dimensions in source butler.
1895 n_exposures = 30
1896 self.source_butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1897 self.source_butler.registry.insertDimensionData(
1898 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
1899 )
1900 self.source_butler.registry.insertDimensionData(
1901 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
1902 )
1904 for i in range(n_exposures):
1905 self.source_butler.registry.insertDimensionData(
1906 "exposure",
1907 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
1908 )
1910 # Create dataset types in the source butler.
1911 dimensions = self.source_butler.registry.dimensions.extract(["instrument", "exposure"])
1912 for datasetTypeName in datasetTypeNames:
1913 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
1914 self.source_butler.registry.registerDatasetType(datasetType)
1916 # Write a dataset to an unrelated run; this ensures that integer
1917 # dataset IDs are rewritten in the target if necessary.
1918 # Not relevant for UUID-based managers.
1919 run = "distraction"
1920 butler = Butler(butler=self.source_butler, run=run)
1921 butler.put(
1922 makeExampleMetrics(),
1923 datasetTypeName,
1924 exposure=1,
1925 instrument="DummyCamComp",
1926 physical_filter="d-r",
1927 )
1929 # Write some example metrics to the source
1930 butler = Butler(butler=self.source_butler)
1932 # Set of DatasetRefs that should be in the list of refs to transfer
1933 # but which will not be transferred.
1934 deleted = set()
1936 n_expected = 20 # Number of datasets expected to be transferred
1937 source_refs = []
1938 for i in range(n_exposures):
1939 # Put a third of the datasets into each collection, but only
1940 # retain two thirds of them.
1941 index = i % 3
1942 run = runs[index]
1943 datasetTypeName = datasetTypeNames[i % 2]
1945 metric_data = {
1946 "summary": {"counter": i},
1947 "output": {"text": "metric"},
1948 "data": [2 * x for x in range(i)],
1949 }
1950 metric = MetricsExample(**metric_data)
1951 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
1952 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
1954 # Remove the datastore record using the low-level API.
1955 if purge:
1956 # Remove records for a fraction.
1957 if index == 1:
1958 # For one of these delete the file as well.
1959 # This allows the "missing" code to filter the
1960 # file out.
1961 # Access the individual datastores.
1962 datastores = []
1963 if hasattr(butler.datastore, "datastores"):
1964 datastores.extend(butler.datastore.datastores)
1965 else:
1966 datastores.append(butler.datastore)
1968 if not deleted:
1969 # For a chained datastore we need to remove
1970 # files in each chain.
1971 for datastore in datastores:
1972 # The file might not be known to the datastore
1973 # if constraints are used.
1974 try:
1975 primary, uris = datastore.getURIs(ref)
1976 except FileNotFoundError:
1977 continue
1978 if primary:
1979 if primary.scheme != "mem":
1980 primary.remove()
1981 for uri in uris.values():
1982 if uri.scheme != "mem":
1983 uri.remove()
1984 n_expected -= 1
1985 deleted.add(ref)
1987 # Remove the datastore record.
1988 for datastore in datastores:
1989 if hasattr(datastore, "removeStoredItemInfo"):
1990 datastore.removeStoredItemInfo(ref)
1992 if index < 2:
1993 source_refs.append(ref)
1994 if ref not in deleted:
1995 new_metric = butler.get(ref.unresolved(), collections=run)
1996 self.assertEqual(new_metric, metric)
1998 # Create some bad dataset types to ensure we check for inconsistent
1999 # definitions.
2000 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
2001 for datasetTypeName in datasetTypeNames:
2002 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
2003 self.target_butler.registry.registerDatasetType(datasetType)
2004 with self.assertRaises(ConflictingDefinitionError) as cm:
2005 self.target_butler.transfer_from(self.source_butler, source_refs)
2006 self.assertIn("dataset type differs", str(cm.exception))
2008 # And remove the bad definitions.
2009 for datasetTypeName in datasetTypeNames:
2010 self.target_butler.registry.removeDatasetType(datasetTypeName)
2012 # Transfer without creating dataset types should fail.
2013 with self.assertRaises(KeyError):
2014 self.target_butler.transfer_from(self.source_butler, source_refs)
2016 # Transfer without creating dimensions should fail.
2017 with self.assertRaises(ConflictingDefinitionError) as cm:
2018 self.target_butler.transfer_from(self.source_butler, source_refs, register_dataset_types=True)
2019 self.assertIn("dimension", str(cm.exception))
2021 # The failed transfer above leaves the registry in an inconsistent
2022 # state because the run is created but then rolled back without
2023 # the collection cache being cleared. For now force a refresh.
2024 # This can be removed with DM-35498.
2025 self.target_butler.registry.refresh()
2027 # Now transfer them to the second butler, including dimensions.
2028 with self.assertLogs(level=logging.DEBUG) as cm:
2029 transferred = self.target_butler.transfer_from(
2030 self.source_butler,
2031 source_refs,
2032 register_dataset_types=True,
2033 transfer_dimensions=True,
2034 )
2035 self.assertEqual(len(transferred), n_expected)
2036 log_output = ";".join(cm.output)
2038 # A ChainedDatastore will use the in-memory datastore for mexists,
2039 # so we cannot rely on the mexists log message.
2040 self.assertIn("Number of datastore records found in source", log_output)
2041 self.assertIn("Creating output run", log_output)
2043 # Do the transfer twice to ensure that it will do nothing extra.
2044 # Only do this if purge=True because it does not work for int
2045 # dataset_id.
2046 if purge:
2047 # This should not need to register dataset types.
2048 transferred = self.target_butler.transfer_from(self.source_butler, source_refs)
2049 self.assertEqual(len(transferred), n_expected)
2051 # Also do an explicit low-level transfer to trigger some
2052 # edge cases.
2053 with self.assertLogs(level=logging.DEBUG) as cm:
2054 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
2055 log_output = ";".join(cm.output)
2056 self.assertIn("no file artifacts exist", log_output)
2058 with self.assertRaises((TypeError, AttributeError)):
2059 self.target_butler.datastore.transfer_from(self.source_butler, source_refs)
2061 with self.assertRaises(ValueError):
2062 self.target_butler.datastore.transfer_from(
2063 self.source_butler.datastore, source_refs, transfer="split"
2064 )
2066 # Now try to get the same refs from the new butler.
2067 for ref in source_refs:
2068 if ref not in deleted:
2069 unresolved_ref = ref.unresolved()
2070 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run)
2071 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run)
2072 self.assertEqual(new_metric, old_metric)
2074 # Now prune the run2 collection and instead create a CHAINED collection.
2075 # This should block the transfer.
2076 self.target_butler.removeRuns(["run2"], unstore=True)
2077 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2078 with self.assertRaises(CollectionTypeError):
2079 # Re-importing the run1 datasets can be problematic if they
2080 # use integer IDs so filter those out.
2081 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2082 self.target_butler.transfer_from(self.source_butler, to_transfer)
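# Editor's illustrative sketch: the high-level transfer pattern exercised by
# assertButlerTransfers above. Both arguments are assumed to be writeable
# Butler instances, and ``refs`` resolved DatasetRefs from the source registry.
def _example_transfer(source_butler, target_butler, refs):
    # Registering dataset types and transferring dimension records first makes
    # the target registry self-consistent before the datasets are copied.
    return target_butler.transfer_from(
        source_butler,
        refs,
        register_dataset_types=True,
        transfer_dimensions=True,
    )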
2085class ChainedDatastoreTransfers(PosixDatastoreTransfers):
2086 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
2089if __name__ == "__main__":
2090 unittest.main()