# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler.
"""

import gc
import json
import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import string
import tempfile
import unittest
import unittest.mock

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto mock_s3 cannot be imported."""
        return cls


try:
    # It's possible but silly to have testing.postgresql installed without
    # having the postgresql server installed (because then nothing in
    # testing.postgresql would work), so we use the presence of that module
    # to test whether we can expect the server to be available.
    import testing.postgresql
except ImportError:
    testing = None

import astropy.time
import sqlalchemy
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    CollectionType,
    Config,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplate,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
    OrphanedRecordError,
)
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImport
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
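    """Return a MetricsExample with fixed example values for put/get tests."""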
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent the misdiagnosis
    that might otherwise occur when a standard exception is used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests of ButlerConfig behavior that are not covered by the
    other test cases."""

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests:
    """Helper methods for running a suite of put/get tests from different
    butler configurations."""

    root = None
    default_run = "ingésτ😺"

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
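        """Check that each named component can be retrieved both directly
        and through a deferred dataset handle.
        """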
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
        removeTestTempDir(self.root)

    def create_butler(self, run, storageClass, datasetTypeName):
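        """Create a butler in the given run, register the given dataset type,
        and insert the instrument/visit dimension records used by these tests.
        """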
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "visit_system": 1,
                },
            )
        return butler, datasetType

    def runPutGetTest(self, storageClass, datasetTypeName):
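        """Put a metric using several argument styles, then exercise get,
        component retrieval, artifact retrieval, and dataset removal.
        """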
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = self.default_run
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test direct get with the resolved ref
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

                # Can the artifacts themselves be retrieved?
                if not butler.datastore.isEphemeral:
                    root_uri = ResourcePath(self.root)

                    for preserve_path in (True, False):
                        destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                        # Use copy so that we can test that overwrite
                        # protection works (using "auto" for File URIs would
                        # use hard links and subsequent transfer would work
                        # because it knows they are the same file).
                        transferred = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, transfer="copy"
                        )
                        self.assertGreater(len(transferred), 0)
                        artifacts = list(ResourcePath.findFileResources([destination]))
                        self.assertEqual(set(transferred), set(artifacts))

                        for artifact in transferred:
                            path_in_destination = artifact.relative_to(destination)
                            self.assertIsNotNone(path_in_destination)

                            # When the path is not preserved there should not
                            # be any path separators.
                            num_seps = path_in_destination.count("/")
                            if preserve_path:
                                self.assertGreater(num_seps, 0)
                            else:
                                self.assertEqual(num_seps, 0)

                        primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                        n_uris = len(secondary_uris)
                        if primary_uri:
                            n_uris += 1
                        self.assertEqual(
                            len(artifacts),
                            n_uris,
                            "Comparing expected artifacts vs actual:"
                            f" {artifacts} vs {primary_uri} and {secondary_uris}",
                        )

                        if preserve_path:
                            # No need to run these twice
                            with self.assertRaises(ValueError):
                                butler.retrieveArtifacts([ref], destination, transfer="move")

                            with self.assertRaises(FileExistsError):
                                butler.retrieveArtifacts([ref], destination)

                            transferred_again = butler.retrieveArtifacts(
                                [ref], destination, preserve_path=preserve_path, overwrite=True
                            )
                            self.assertEqual(set(transferred_again), set(transferred))

                # Now remove the dataset completely.
                butler.pruneDatasets([ref], purge=True, unstore=True)
                # Lookup with original args should still fail.
                with self.assertRaises(LookupError):
                    butler.datasetExists(*args, collections=this_run)
                # get() should still fail.
                with self.assertRaises(FileNotFoundError):
                    butler.get(ref)
                # Registry shouldn't be able to find it by dataset_id anymore.
                self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know they are empty.
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.get(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed.
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion.
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present.

        return butler

    def testDeferredCollectionPassing(self):
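        """Test passing run and collections at call time rather than at
        Butler construction.
        """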
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op.
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection raises a
        # CollectionError.
        with self.assertRaises(CollectionError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Removing the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run=self.default_run)
            self.assertIsInstance(butler, Butler)

            # Even with a ResourcePath.
            butler = Butler(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {self.default_run})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, ("other",))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler("not_there", writeable=False)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases"):
            # No aliases registered.
            Butler("not_there")
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testStorageClassOverrideGet(self):
        """Test storage class conversion on get with override."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        datasetTypeName = "anything"
        run = self.default_run

        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset.
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        ref = butler.put(metric, datasetType, dataId)

        # Return native type.
        retrieved = butler.get(ref)
        self.assertEqual(retrieved, metric)

        # Specify an override.
        new_sc = self.storageClassFactory.getStorageClass("MetricsConversion")
        model = butler.get(ref, storageClass=new_sc)
        self.assertNotEqual(type(model), type(retrieved))
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override later.
        deferred = butler.getDeferred(ref)
        model = deferred.get(storageClass=new_sc)
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override up front.
        deferred = butler.getDeferred(ref, storageClass=new_sc)
        model = deferred.get()
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Retrieve a component. Should be a tuple.
        data = butler.get("anything.data", dataId, storageClass="StructuredDataDataTestTuple")
        self.assertIs(type(data), tuple)
        self.assertEqual(data, tuple(retrieved.data))

        # Parameter on the write storage class should work regardless
        # of read storage class.
        data = butler.get(
            "anything.data",
            dataId,
            storageClass="StructuredDataDataTestTuple",
            parameters={"slice": slice(2, 4)},
        )
        self.assertEqual(len(data), 2)

        # Try a parameter that is known to the read storage class but not
        # the write storage class.
        with self.assertRaises(KeyError):
            butler.get(
                "anything.data",
                dataId,
                storageClass="StructuredDataDataTestTuple",
                parameters={"xslice": slice(2, 4)},
            )

    def testPytypePutCoercion(self):
        """Test python type coercion on Butler.get and put."""

        # Store some data with the normal example storage class.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        datasetTypeName = "test_metric"
        butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)

        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Put a dict and this should coerce to a MetricsExample
        test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
        metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
        test_metric = butler.get(metric_ref)
        self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
        self.assertEqual(test_metric.summary, test_dict["summary"])
        self.assertEqual(test_metric.output, test_dict["output"])

        # Check that the put still works if a DatasetType is given with
        # a definition matching this python type.
        registry_type = butler.registry.getDatasetType(datasetTypeName)
        this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
        metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
        self.assertEqual(metric2_ref.datasetType, registry_type)

        # The get will return the type expected by registry.
        test_metric2 = butler.get(metric2_ref)
        self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")

        # Make a new DatasetRef with the compatible but different DatasetType.
        # This should now return a dict.
        new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
        test_dict2 = butler.get(new_ref)
        self.assertEqual(get_full_type_name(test_dict2), "dict")

        # Get it again, this time passing the compatible dataset type
        # explicitly rather than a resolved ref. The behavior should be
        # consistent and return the python type of the given DatasetType.
        test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
        self.assertEqual(get_full_type_name(test_dict3), "dict")

    def testIngest(self):
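        """Test ingesting pre-existing files, both one dataset per file and
        multiple datasets in a single file.
        """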
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, run=self.default_run)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, run=self.default_run))

        # Test "move" transfer to ensure that the files themselves
        # have disappeared following ingest.
        with ResourcePath.temporary_uri(suffix=".yaml") as tempFile:
            tempFile.transfer_from(ResourcePath(metricFile), transfer="copy")

            datasets = []
            datasets.append(FileDataset(path=tempFile, refs=refs, formatter=MultiDetectorFormatter))

            # For first ingest use copy.
            butler.ingest(*datasets, transfer="copy", record_validation_info=False)

            # Now try to ingest again in "execution butler" mode where
            # the registry entries exist but the datastore does not have
            # the files. We also need to strip the dimension records to ensure
            # that they will be re-added by the ingest.
            ref = datasets[0].refs[0]
            datasets[0].refs = [
                butler.registry.findDataset(ref.datasetType, dataId=ref.dataId, collections=ref.run)
                for ref in datasets[0].refs
            ]
            all_refs = []
            for dataset in datasets:
                refs = []
                for ref in dataset.refs:
                    # Create a dict from the dataId to drop the records.
                    new_data_id = {str(k): v for k, v in ref.dataId.items()}
                    new_ref = butler.registry.findDataset(ref.datasetType, **new_data_id, collections=ref.run)
                    self.assertFalse(new_ref.dataId.hasRecords())
                    refs.append(new_ref)
                dataset.refs = refs
                all_refs.extend(dataset.refs)
            butler.pruneDatasets(all_refs, disassociate=False, unstore=True, purge=False)

            butler._allow_put_of_predefined_dataset = True

            # Use move mode to test that the file is deleted. Also
            # disable recording of file size.
            butler.ingest(*datasets, transfer="move", record_validation_info=False)

            # Check that every ref now has records.
            for dataset in datasets:
                for ref in dataset.refs:
                    self.assertTrue(ref.dataId.hasRecords())

            # Ensure that the file has disappeared.
            self.assertFalse(tempFile.exists())

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory cannot ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

        # Ensure we can ingest 0 datasets
        datasets = []
        butler.ingest(*datasets)

    def testPickle(self):
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
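        """Test registration and querying of dataset types, including their
        component dataset types, plus configuration validation.
        """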
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            (
                "instrument",
                {"instrument": "DummyCam"},
                {"instrument": "DummyHSC"},
                {"instrument": "DummyCamComp"},
            ),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are not
        # created for components, but querying can still return the
        # components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry: set[DatasetType] = set()
        for parent_dataset_type in butler.registry.queryDatasetTypes():
            fromRegistry.add(parent_dataset_type)
            fromRegistry.update(parent_dataset_type.makeAllComponentDatasetTypes())
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self):
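        """Test that a failed transaction rolls back both registry and
        datastore changes.
        """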
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test direct get with the resolved ref
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.get(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # Create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check if a file exists at a given path (relative to root).

        Test testPutTemplates verifies actual physical existence of the files
        in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
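        """Test that file templates produce the expected paths and that
        invalid templates are rejected.
        """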
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(uri.exists())
        self.assertTrue(
            uri.unquoted_path.endswith(f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle")
        )

        # Check the template based on dimensions
        if hasattr(butler.datastore, "templates"):
            butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(uri.exists())
        self.assertTrue(
            uri.unquoted_path.endswith(f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle")
        )

        # Check the template based on dimensions
        if hasattr(butler.datastore, "templates"):
            butler.datastore.templates.validateTemplates([ref])

        # Use a template that has a typo in dimension record metadata.
        # Easier to test with a butler that has a ref with records attached.
        template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits")
        with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"):
            path = template.format(ref)
        self.assertEqual(path, f"a/v423/{ref.id}_fits")

        template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits")
        with self.assertRaises(KeyError):
            with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"):
                template.format(ref)

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """Export datasets to a temp directory and import them back into a
        new temp directory repo. Does not assume a posix datastore.
        """
        exportButler = self.runPutGetTest(storageClass, "test_metric")

        # Test that we must have a file extension.
        with self.assertRaises(ValueError):
            with exportButler.export(filename="dump", directory=".") as export:
                pass

        # Test that an unknown format is not allowed.
        with self.assertRaises(ValueError):
            with exportButler.export(filename="dump.fits", directory=".") as export:
                pass

        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements even
                # though there aren't any in these datasets or in the database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public api.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                        reuse_ids=False,
                    )
                importButler = Butler(importDir, run=self.default_run)
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )

    def testRemoveRuns(self):
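        """Test removal of entire RUN collections, with and without
        unstoring their datasets.
        """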
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put a dataset in each.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])

        with self.assertRaises(OrphanedRecordError):
            butler.registry.removeDatasetType(datasetType.name)

        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)

        with self.assertLogs("lsst.daf.butler.registries", "INFO") as cm:
            butler.registry.removeDatasetType(tuple(["test*", "test*"]))
        self.assertIn("not defined", "\n".join(cm.output))


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self):
        """Independent test of constructor using PathLike."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self):
        """Test local export using all transfer modes."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        datastoreRoot = exportButler.datastore.root

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume local file system
            self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
                    export.saveDatasets(datasets)
                for path in pathsInStore:
                    self.assertTrue(
                        self.checkFileExists(exportDir, path),
                        f"Check that mode {transfer} exported files",
                    )

    def testPruneDatasets(self):
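        """Test pruning of datasets, including datastore trash handling
        when registry records have already been removed.
        """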
1395 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1396 butler = Butler(self.tmpConfigFile, writeable=True)
1397 # Load registry data with dimensions to hang datasets off of.
1398 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1399 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1400 # Add some RUN-type collections.
1401 run1 = "run1"
1402 butler.registry.registerRun(run1)
1403 run2 = "run2"
1404 butler.registry.registerRun(run2)
1405 # put some datasets. ref1 and ref2 have the same data ID, and are in
1406 # different runs. ref3 has a different data ID.
1407 metric = makeExampleMetrics()
1408 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1409 datasetType = self.addDatasetType(
1410 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1411 )
1412 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1413 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1414 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1416 # Simple prune.
1417 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1418 with self.assertRaises(LookupError):
1419 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)
1421 # Put data back.
1422 ref1 = butler.put(metric, ref1.unresolved(), run=run1)
1423 ref2 = butler.put(metric, ref2.unresolved(), run=run2)
1424 ref3 = butler.put(metric, ref3.unresolved(), run=run1)
1426 # Check that in normal mode, deleting the record will lead to
1427 # trash not touching the file.
1428 uri1 = butler.datastore.getURI(ref1)
1429 butler.datastore.bridge.moveToTrash([ref1], transaction=None) # Update the dataset_location table
1430 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
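        # Together the two calls above remove all datastore knowledge of
        # the artifact at a low level: the bridge drops the location
        # record and the internal table forgets the artifact metadata.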
        butler.datastore.trash(ref1)
        butler.datastore.emptyTrash()
        self.assertTrue(uri1.exists())
        uri1.remove()  # Clean it up.

        # Simulate execution butler setup by deleting the datastore
        # record but keeping the file around and trusting.
        butler.datastore.trustGetRequest = True
        uri2 = butler.datastore.getURI(ref2)
        uri3 = butler.datastore.getURI(ref3)
        self.assertTrue(uri2.exists())
        self.assertTrue(uri3.exists())

        # Remove the datastore record.
        butler.datastore.bridge.moveToTrash([ref2], transaction=None)  # Update the dataset_location table
        butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
        self.assertTrue(uri2.exists())
        butler.datastore.trash([ref2, ref3])
        # Immediate removal for ref2 file
        self.assertFalse(uri2.exists())
        # But ref3 has to wait for the empty.
        self.assertTrue(uri3.exists())
        butler.datastore.emptyTrash()
        self.assertFalse(uri3.exists())

        # Clear out the datasets from registry.
        butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)

    def testPytypeCoercion(self):
        """Test python type coercion on Butler.get and put."""

        # Store some data with the normal example storage class.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        datasetTypeName = "test_metric"
        butler = self.runPutGetTest(storageClass, datasetTypeName)

        dataId = {"instrument": "DummyCamComp", "visit": 423}
        metric = butler.get(datasetTypeName, dataId=dataId)
        self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")

        datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
        self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")

        # Now need to hack the registry dataset type definition.
        # There is no API for this.
        manager = butler.registry._managers.datasets
        manager._db.update(
            manager._static.dataset_type,
            {"name": datasetTypeName},
            {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
        )
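        # Low-level Database.update() convention: the second argument maps
        # the column used in the WHERE clause to the row key carrying its
        # value, and the final dict supplies that key plus the new
        # storage_class column value.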

        # Force reset of dataset type cache
        butler.registry.refresh()

        datasetType_new = butler.registry.getDatasetType(datasetTypeName)
        self.assertEqual(datasetType_new.name, datasetType_ori.name)
        self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")

        metric_model = butler.get(datasetTypeName, dataId=dataId)
        self.assertNotEqual(type(metric_model), type(metric))
        self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")

        # Put the model and read it back to show that everything now
        # works as normal.
        metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
        metric_model_new = butler.get(metric_ref)
        self.assertEqual(metric_model_new, metric_model)

        # Hack the storage class again to something that will fail on the
        # get with no conversion class.
        manager._db.update(
            manager._static.dataset_type,
            {"name": datasetTypeName},
            {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
        )
        butler.registry.refresh()

        with self.assertRaises(ValueError):
            butler.get(datasetTypeName, dataId=dataId)


@unittest.skipUnless(testing is not None, "testing.postgresql module not found")
class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler using Postgres"""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "PostgreSQL@test"

    @staticmethod
    def _handler(postgresql):
        engine = sqlalchemy.engine.create_engine(postgresql.url())
        with engine.begin() as connection:
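            # The registry's PostgreSQL schema relies on exclusion
            # constraints that require the btree_gist extension, so
            # install it in each freshly initialized test database.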
            connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;"))

    @classmethod
    def setUpClass(cls):
        # Create the postgres test server.
        cls.postgresql = testing.postgresql.PostgresqlFactory(
            cache_initialized_db=True, on_initialized=cls._handler
        )
        super().setUpClass()

    @classmethod
    def tearDownClass(cls):
        # Clean up any lingering SQLAlchemy engines/connections
        # so they're closed before we shut down the server.
        gc.collect()
        cls.postgresql.clear_cache()
        super().tearDownClass()

    def setUp(self):
        self.server = self.postgresql()

        # Need to add a registry section to the config.
        self._temp_config = False
        config = Config(self.configFile)
        config["registry", "db"] = self.server.url()
        with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh:
            config.dump(fh)
            self.configFile = fh.name
            self._temp_config = True
        super().setUp()

    def tearDown(self):
        self.server.stop()
        if self._temp_config and os.path.exists(self.configFile):
            os.remove(self.configFile)
        super().tearDown()

    def testMakeRepo(self):
        # The base class test assumes that it's using sqlite and assumes
        # the config file is acceptable to sqlite.
        raise unittest.SkipTest("Postgres config is not compatible with this test.")


class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
    """InMemoryDatastore specialization of a butler"""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
    fullConfigKey = None
    useTempRoot = False
    validationCanFail = False
    datastoreStr = ["datastore='InMemory"]
    datastoreName = ["InMemoryDatastore@"]
    registryStr = "/gen3.sqlite3"

    def testIngest(self):
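        # Ingest imports existing file artifacts into a datastore, which
        # has no analog for an in-memory datastore, so the inherited test
        # is disabled here.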
        pass


class ChainedDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """ChainedDatastore specialization of a butler"""

    configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
    fullConfigKey = ".datastore.datastores.1.formatters"
    validationCanFail = True
    datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
    datastoreName = [
        "InMemoryDatastore@",
        f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
        "SecondDatastore",
    ]
    registryStr = "/gen3.sqlite3"


class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
    """Test that a yaml file in one location can refer to a root in another."""

    datastoreStr = ["dir1"]
    # Disable the makeRepo test since we are deliberately not using
    # butler.yaml as the config name.
    fullConfigKey = None

    def setUp(self):
        self.root = makeTestTempDir(TESTDIR)

        # Make a new repository in one place
        self.dir1 = os.path.join(self.root, "dir1")
        Butler.makeRepo(self.dir1, config=Config(self.configFile))

        # Move the yaml file to a different place and add a "root"
        self.dir2 = os.path.join(self.root, "dir2")
        os.makedirs(self.dir2, exist_ok=True)
        configFile1 = os.path.join(self.dir1, "butler.yaml")
        config = Config(configFile1)
        config["root"] = self.dir1
        configFile2 = os.path.join(self.dir2, "butler2.yaml")
        config.dumpToUri(configFile2)
        os.remove(configFile1)
        self.tmpConfigFile = configFile2

    def testFileLocations(self):
        self.assertNotEqual(self.dir1, self.dir2)
        self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
        self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
        self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))


class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
    """Test that a config file created by makeRepo outside of the repo works."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        self.root = makeTestTempDir(TESTDIR)
        self.root2 = makeTestTempDir(TESTDIR)

        self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
        Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)

    def tearDown(self):
        if os.path.exists(self.root2):
            shutil.rmtree(self.root2, ignore_errors=True)
        super().tearDown()

    def testConfigExistence(self):
        c = Config(self.tmpConfigFile)
        uri_config = ResourcePath(c["root"])
        uri_expected = ResourcePath(self.root, forceDirectory=True)
        self.assertEqual(uri_config.geturl(), uri_expected.geturl())
        self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")

    def testPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")


class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
    """Test that a config file created by makeRepo outside of the repo works
    when the outfile is given as a directory."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        self.root = makeTestTempDir(TESTDIR)
        self.root2 = makeTestTempDir(TESTDIR)

        self.tmpConfigFile = self.root2
        Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)

    def testConfigExistence(self):
        # Append the yaml file else Config constructor does not know the file
        # type.
        self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
        super().testConfigExistence()


class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
    """Test that a config file created by makeRepo outside of the repo works
    when the outfile is given as a URI."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        self.root = makeTestTempDir(TESTDIR)
        self.root2 = makeTestTempDir(TESTDIR)

        self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
        Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)


@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """S3Datastore specialization of a butler; an S3 storage Datastore +
    a local in-memory SqlRegistry.
    """

    configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
    fullConfigKey = None
    validationCanFail = True

    bucketName = "anybucketname"
    """Name of the bucket that will be used in the tests. The name is read
    from the config file used by the tests during set-up.
    """

    root = "butlerRoot/"
    """Root repository directory expected to be used when useTempRoot=False.
    Otherwise the root is set to a randomly generated 20-character string
    during set-up.
    """

    datastoreStr = [f"datastore={root}"]
    """Contains all expected root locations in a format expected to be
    returned by Butler stringification.
    """

    datastoreName = [f"FileDatastore@s3://{bucketName}/{root}"]
    """The expected format of the S3 Datastore string."""

    registryStr = "/gen3.sqlite3"
    """Expected format of the Registry string."""

    mock_s3 = mock_s3()
    """The mocked s3 interface from moto."""

    def genRoot(self):
        """Return a random 20-character string to serve as the root
        name for the temporary bucket repo.

        This is the S3 equivalent of tempfile.mkdtemp: it is what self.root
        becomes when useTempRoot is True.
        """
        rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
        return rndstr + "/"

    def setUp(self):
        config = Config(self.configFile)
        uri = ResourcePath(config[".datastore.datastore.root"])
        self.bucketName = uri.netloc

        # Enable S3 mocking of tests.
        self.mock_s3.start()

        # Set up some fake credentials if they do not exist.
        self.usingDummyCredentials = setAwsEnvCredentials()

        if self.useTempRoot:
            self.root = self.genRoot()
        rooturi = f"s3://{self.bucketName}/{self.root}"
        config.update({"datastore": {"datastore": {"root": rooturi}}})

        # Need a local folder to store the registry database.
        self.reg_dir = makeTestTempDir(TESTDIR)
        config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"

        # MOTO needs to know that we expect Bucket bucketName to exist
        # (this used to be the class attribute bucketName)
        s3 = boto3.resource("s3")
        s3.create_bucket(Bucket=self.bucketName)

        self.datastoreStr = f"datastore={self.root}"
        self.datastoreName = [f"FileDatastore@{rooturi}"]
        Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
        self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")

    def tearDown(self):
        s3 = boto3.resource("s3")
        bucket = s3.Bucket(self.bucketName)
        try:
            bucket.objects.all().delete()
        except botocore.exceptions.ClientError as e:
            if e.response["Error"]["Code"] == "404":
                # The key was not reachable; nothing to clean up.
                pass
            else:
                raise

        bucket = s3.Bucket(self.bucketName)
        bucket.delete()

        # Stop the S3 mock.
        self.mock_s3.stop()

        # Unset any potentially set dummy credentials.
        if self.usingDummyCredentials:
            unsetAwsEnvCredentials()

        if self.reg_dir is not None and os.path.exists(self.reg_dir):
            shutil.rmtree(self.reg_dir, ignore_errors=True)

        if self.useTempRoot and os.path.exists(self.root):
            shutil.rmtree(self.root, ignore_errors=True)

        super().tearDown()


class PosixDatastoreTransfers(unittest.TestCase):
    """Test data transfers between butlers.

    Tests are run for different dataset ID managers. UUID to UUID and
    integer to integer are tested. UUID to integer is not supported since
    we do not currently want to allow that. Integer to UUID is supported
    with the caveat that UUID4 will be generated and this will be
    incorrect for raw dataset types. The test ignores that.
    """

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def setUp(self):
        self.root = makeTestTempDir(TESTDIR)
        self.config = Config(self.configFile)

    def tearDown(self):
        removeTestTempDir(self.root)

    def create_butler(self, manager, label):
        config = Config(self.configFile)
        config["registry", "managers", "datasets"] = manager
        return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)

    def create_butlers(self, manager1=None, manager2=None):
        default = "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID"
        if manager1 is None:
            manager1 = default
        if manager2 is None:
            manager2 = default
        self.source_butler = self.create_butler(manager1, "1")
        self.target_butler = self.create_butler(manager2, "2")

    def testTransferUuidToUuid(self):
        self.create_butlers()
        # Setting id_gen_map should have no effect here
        self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})

    def _enable_trust(self, datastore) -> None:
        if hasattr(datastore, "trustGetRequest"):
            datastore.trustGetRequest = True
        elif hasattr(datastore, "datastores"):
            for datastore in datastore.datastores:
                if hasattr(datastore, "trustGetRequest"):
                    datastore.trustGetRequest = True

    def testTransferMissing(self):
        """Test transfers where datastore records are missing.

        This is how execution butler works.
        """
        self.create_butlers()

        # Configure the source butler to allow trust.
        self._enable_trust(self.source_butler.datastore)

        self.assertButlerTransfers(purge=True)

    def testTransferMissingDisassembly(self):
        """Test transfers where datastore records are missing.

        This is how execution butler works.
        """
        self.create_butlers()

        # Configure the source butler to allow trust.
        self._enable_trust(self.source_butler.datastore)

        # Test disassembly.
        self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")

    def testAbsoluteURITransferDirect(self):
        """Test transfer using an absolute URI."""
        self._absolute_transfer("auto")

    def testAbsoluteURITransferCopy(self):
        """Test transfer using an absolute URI."""
        self._absolute_transfer("copy")

    def _absolute_transfer(self, transfer):
        self.create_butlers()

        storageClassName = "StructuredData"
        storageClass = self.storageClassFactory.getStorageClass(storageClassName)
        datasetTypeName = "random_data"
        run = "run1"
        self.source_butler.registry.registerCollection(run, CollectionType.RUN)

        dimensions = self.source_butler.registry.dimensions.extract(())
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        self.source_butler.registry.registerDatasetType(datasetType)

        metrics = makeExampleMetrics()
        with ResourcePath.temporary_uri(suffix=".json") as temp:
            source_refs = [DatasetRef(datasetType, {}, run=run)]
            temp.write(json.dumps(metrics.exportAsDict()).encode())
            dataset = FileDataset(path=temp, refs=source_refs)
            self.source_butler.ingest(dataset, transfer="direct")
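            # A "direct" ingest registers the absolute URI of the
            # temporary file without copying it into the datastore, which
            # is why an "auto" transfer is expected to preserve that URI.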

            self.target_butler.transfer_from(
                self.source_butler, dataset.refs, register_dataset_types=True, transfer=transfer
            )

            uri = self.target_butler.getURI(dataset.refs[0])
            if transfer == "auto":
                self.assertEqual(uri, temp)
            else:
                self.assertNotEqual(uri, temp)

    def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"):
        """Test that a run can be transferred to another butler."""

        storageClass = self.storageClassFactory.getStorageClass(storageClassName)
        datasetTypeName = "random_data"

        # Test will create 3 collections and we will want to transfer
        # two of those three.
        runs = ["run1", "run2", "other"]

        # Also want to use two different dataset types to ensure that
        # grouping works.
        datasetTypeNames = ["random_data", "random_data_2"]

        # Create the run collections in the source butler.
        for run in runs:
            self.source_butler.registry.registerCollection(run, CollectionType.RUN)

        # Create dimensions in source butler.
        n_exposures = 30
        self.source_butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        self.source_butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        self.source_butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        for i in range(n_exposures):
            self.source_butler.registry.insertDimensionData(
                "exposure",
                {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
            )

        # Create dataset types in the source butler.
        dimensions = self.source_butler.registry.dimensions.extract(["instrument", "exposure"])
        for datasetTypeName in datasetTypeNames:
            datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
            self.source_butler.registry.registerDatasetType(datasetType)

        # Write a dataset to an unrelated run -- this will ensure that
        # we are rewriting integer dataset ids in the target if necessary.
        # Will not be relevant for UUID.
        run = "distraction"
        butler = Butler(butler=self.source_butler, run=run)
        butler.put(
            makeExampleMetrics(),
            datasetTypeName,
            exposure=1,
            instrument="DummyCamComp",
            physical_filter="d-r",
        )

        # Write some example metrics to the source
        butler = Butler(butler=self.source_butler)

        # Set of DatasetRefs that should be in the list of refs to transfer
        # but which will not be transferred.
        deleted = set()

        n_expected = 20  # Number of datasets expected to be transferred
        source_refs = []
        for i in range(n_exposures):
            # Put a third of datasets into each collection, only retain
            # two thirds.
            index = i % 3
            run = runs[index]
            datasetTypeName = datasetTypeNames[i % 2]

            metric_data = {
                "summary": {"counter": i},
                "output": {"text": "metric"},
                "data": [2 * x for x in range(i)],
            }
            metric = MetricsExample(**metric_data)
            dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)

            # Remove the datastore record using low-level API
            if purge:
                # Remove records for a fraction.
                if index == 1:
                    # For one of these delete the file as well.
                    # This allows the "missing" code to filter the
                    # file out.
                    # Access the individual datastores.
                    datastores = []
                    if hasattr(butler.datastore, "datastores"):
                        datastores.extend(butler.datastore.datastores)
                    else:
                        datastores.append(butler.datastore)

                    if not deleted:
                        # For a chained datastore we need to remove
                        # files in each chain.
                        for datastore in datastores:
                            # The file might not be known to the datastore
                            # if constraints are used.
                            try:
                                primary, uris = datastore.getURIs(ref)
                            except FileNotFoundError:
                                continue
                            if primary:
                                if primary.scheme != "mem":
                                    primary.remove()
                            for uri in uris.values():
                                if uri.scheme != "mem":
                                    uri.remove()
                        n_expected -= 1
                        deleted.add(ref)

                    # Remove the datastore record.
                    for datastore in datastores:
                        if hasattr(datastore, "removeStoredItemInfo"):
                            datastore.removeStoredItemInfo(ref)
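
            # Only datasets placed in the first two runs are retained for
            # the transfer list; everything in "other" stays behind.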
            if index < 2:
                source_refs.append(ref)
            if ref not in deleted:
                new_metric = butler.get(ref.unresolved(), collections=run)
                self.assertEqual(new_metric, metric)

        # Create some bad dataset types to ensure we check for inconsistent
        # definitions.
        badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
        for datasetTypeName in datasetTypeNames:
            datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
            self.target_butler.registry.registerDatasetType(datasetType)
        with self.assertRaises(ConflictingDefinitionError) as cm:
            self.target_butler.transfer_from(self.source_butler, source_refs)
        self.assertIn("dataset type differs", str(cm.exception))

        # And remove the bad definitions.
        for datasetTypeName in datasetTypeNames:
            self.target_butler.registry.removeDatasetType(datasetTypeName)

        # Transfer without creating dataset types should fail.
        with self.assertRaises(KeyError):
            self.target_butler.transfer_from(self.source_butler, source_refs)

        # Transfer without creating dimensions should fail.
        with self.assertRaises(ConflictingDefinitionError) as cm:
            self.target_butler.transfer_from(self.source_butler, source_refs, register_dataset_types=True)
        self.assertIn("dimension", str(cm.exception))

        # The failed transfer above leaves registry in an inconsistent
        # state because the run is created but then rolled back without
        # the collection cache being cleared. For now force a refresh.
        # Can remove with DM-35498.
        self.target_butler.registry.refresh()

        # Now transfer them to the second butler, including dimensions.
        with self.assertLogs(level=logging.DEBUG) as cm:
            transferred = self.target_butler.transfer_from(
                self.source_butler,
                source_refs,
                register_dataset_types=True,
                transfer_dimensions=True,
            )
        self.assertEqual(len(transferred), n_expected)
        log_output = ";".join(cm.output)

        # A ChainedDatastore will use the in-memory datastore for mexists
        # so we can not rely on the mexists log message.
        self.assertIn("Number of datastore records found in source", log_output)
        self.assertIn("Creating output run", log_output)

        # Do the transfer twice to ensure that it will do nothing extra.
        # Only do this if purge=True because it does not work for int
        # dataset_id.
        if purge:
            # This should not need to register dataset types.
            transferred = self.target_butler.transfer_from(self.source_butler, source_refs)
            self.assertEqual(len(transferred), n_expected)

        # Also do an explicit low-level transfer to trigger some
        # edge cases.
        with self.assertLogs(level=logging.DEBUG) as cm:
            self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
        log_output = ";".join(cm.output)
        self.assertIn("no file artifacts exist", log_output)

        with self.assertRaises((TypeError, AttributeError)):
            self.target_butler.datastore.transfer_from(self.source_butler, source_refs)

        with self.assertRaises(ValueError):
            self.target_butler.datastore.transfer_from(
                self.source_butler.datastore, source_refs, transfer="split"
            )

        # Now try to get the same refs from the new butler.
        for ref in source_refs:
            if ref not in deleted:
                unresolved_ref = ref.unresolved()
                new_metric = self.target_butler.get(unresolved_ref, collections=ref.run)
                old_metric = self.source_butler.get(unresolved_ref, collections=ref.run)
                self.assertEqual(new_metric, old_metric)

        # Now prune run2 collection and create instead a CHAINED collection.
        # This should block the transfer.
        self.target_butler.removeRuns(["run2"], unstore=True)
        self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
        with self.assertRaises(CollectionTypeError):
            # Re-importing the run1 datasets can be problematic if they
            # use integer IDs so filter those out.
            to_transfer = [ref for ref in source_refs if ref.run == "run2"]
            self.target_butler.transfer_from(self.source_butler, to_transfer)


class ChainedDatastoreTransfers(PosixDatastoreTransfers):
    configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")


if __name__ == "__main__":
    unittest.main()