Coverage for tests/test_butler.py: 14%
1251 statements

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler.
"""

import gc
import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import socket
import string
import tempfile
import time
import unittest
import unittest.mock
from tempfile import gettempdir
from threading import Thread

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto's mock_s3 cannot be imported."""
        return cls


try:
    # It's possible but silly to have testing.postgresql installed without
    # having the postgresql server installed (because then nothing in
    # testing.postgresql would work), so we use the presence of that module
    # to test whether we can expect the server to be available.
    import testing.postgresql
except ImportError:
    testing = None


try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None

import astropy.time
import sqlalchemy
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    CollectionType,
    Config,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplate,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
)
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.http import _is_webdav_endpoint
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImport
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )
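
# Note on the helper above: the three positional arguments populate the
# MetricsExample ``summary``, ``output`` and ``data`` attributes in that
# order (those attribute names are relied upon by the component tests
# below), so e.g. makeExampleMetrics().data[:4] == [563, 234, 456.7, 752].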


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent misdiagnosis
    that might otherwise occur when a standard exception is used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not covered by any other test
    cases."""

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")
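
    # A compact restatement of what the assertions above check (a sketch;
    # the key and value come from the test configs used here): a directory
    # passed via ``searchPaths`` is consulted first, so matching keys
    # override those in the repo config, e.g.
    #
    #     config = ButlerConfig(configFile, searchPaths=[overrideDirectory])
    #     assert config[("datastore", "records", "table")] == "override_record"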


class ButlerPutGetTests:
    """Helper class for running a suite of put/get tests against different
    butler configurations."""

    root = None
    default_run = "ingésτ😺"

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
        removeTestTempDir(self.root)

    def create_butler(self, run, storageClass, datasetTypeName):
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "visit_system": 1,
                },
            )
        return butler, datasetType

    def runPutGetTest(self, storageClass, datasetTypeName):
        # New datasets will be added to a distinct run collection for each
        # put below, and subsequent lookups are made against those run
        # collections.
        run = self.default_run
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # And deferred direct with ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

            # Can the artifacts themselves be retrieved?
            if not butler.datastore.isEphemeral:
                root_uri = ResourcePath(self.root)

                for preserve_path in (True, False):
                    destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                    # Use copy so that we can test that overwrite
                    # protection works (using "auto" for File URIs would
                    # use hard links and subsequent transfer would work
                    # because it knows they are the same file).
                    transferred = butler.retrieveArtifacts(
                        [ref], destination, preserve_path=preserve_path, transfer="copy"
                    )
                    self.assertGreater(len(transferred), 0)
                    artifacts = list(ResourcePath.findFileResources([destination]))
                    self.assertEqual(set(transferred), set(artifacts))

                    for artifact in transferred:
                        path_in_destination = artifact.relative_to(destination)
                        self.assertIsNotNone(path_in_destination)

                        # When the path is not preserved there should not be
                        # any path separators.
                        num_seps = path_in_destination.count("/")
                        if preserve_path:
                            self.assertGreater(num_seps, 0)
                        else:
                            self.assertEqual(num_seps, 0)

                    primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                    n_uris = len(secondary_uris)
                    if primary_uri:
                        n_uris += 1
                    self.assertEqual(
                        len(artifacts),
                        n_uris,
                        "Comparing expected artifacts vs actual:"
                        f" {artifacts} vs {primary_uri} and {secondary_uris}",
                    )

                    if preserve_path:
                        # No need to run these twice
                        with self.assertRaises(ValueError):
                            butler.retrieveArtifacts([ref], destination, transfer="move")

                        with self.assertRaises(FileExistsError):
                            butler.retrieveArtifacts([ref], destination)

                        transferred_again = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, overwrite=True
                        )
                        self.assertEqual(set(transferred_again), set(transferred))

            # Now remove the dataset completely.
            butler.pruneDatasets([ref], purge=True, unstore=True)
            # Lookup with original args should still fail.
            with self.assertRaises(LookupError):
                butler.datasetExists(*args, collections=this_run)
            # getDirect() should still fail.
            with self.assertRaises(FileNotFoundError):
                butler.getDirect(ref)
            # Registry shouldn't be able to find it by dataset_id anymore.
            self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know the collection is
            # empty.
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present.

        return butler

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is a
        # CollectionError.
        with self.assertRaises(CollectionError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Deleting the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run=self.default_run)
            self.assertIsInstance(butler, Butler)

            # Even with a ResourcePath.
            butler = Butler(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {self.default_run})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, ("other",))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler("not_there", writeable=False)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases"):
            # No aliases registered.
            Butler("not_there")
        self.assertEqual(Butler.get_known_repos(), set())
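
    # For reference, the repository index written above is just a small
    # mapping of labels to repo URIs; dumped as YAML it would look roughly
    # like this (a sketch using the values from this test):
    #
    #     label: /path/to/test/root/butler.yaml
    #     bad_label: s3://bucket/not_real.yaml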

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testStorageClassOverrideGet(self):
        """Test storage class conversion on get with override."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        datasetTypeName = "anything"
        run = self.default_run

        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset.
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        ref = butler.put(metric, datasetType, dataId)

        # Return native type.
        retrieved = butler.get(ref)
        self.assertEqual(retrieved, metric)

        # Specify an override.
        new_sc = self.storageClassFactory.getStorageClass("MetricsConversion")
        model = butler.getDirect(ref, storageClass=new_sc)
        self.assertNotEqual(type(model), type(retrieved))
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override later.
        deferred = butler.getDirectDeferred(ref)
        model = deferred.get(storageClass=new_sc)
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override up front.
        deferred = butler.getDirectDeferred(ref, storageClass=new_sc)
        model = deferred.get()
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Retrieve a component. Should be a tuple.
        data = butler.get("anything.data", dataId, storageClass="StructuredDataDataTestTuple")
        self.assertIs(type(data), tuple)
        self.assertEqual(data, tuple(retrieved.data))

        # Parameter on the write storage class should work regardless
        # of read storage class.
        data = butler.get(
            "anything.data",
            dataId,
            storageClass="StructuredDataDataTestTuple",
            parameters={"slice": slice(2, 4)},
        )
        self.assertEqual(len(data), 2)

        # Try a parameter that is known to the read storage class but not
        # the write storage class.
        with self.assertRaises(KeyError):
            butler.get(
                "anything.data",
                dataId,
                storageClass="StructuredDataDataTestTuple",
                parameters={"xslice": slice(2, 4)},
            )

    def testPytypePutCoercion(self):
        """Test Python type coercion on Butler.get and put."""

        # Store some data with the normal example storage class.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        datasetTypeName = "test_metric"
        butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)

        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Put a dict and this should coerce to a MetricsExample
        test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
        metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
        test_metric = butler.getDirect(metric_ref)
        self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
        self.assertEqual(test_metric.summary, test_dict["summary"])
        self.assertEqual(test_metric.output, test_dict["output"])

        # Check that the put still works if a DatasetType is given with
        # a definition matching this Python type.
        registry_type = butler.registry.getDatasetType(datasetTypeName)
        this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
        metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
        self.assertEqual(metric2_ref.datasetType, registry_type)

        # The get will return the type expected by registry.
        test_metric2 = butler.getDirect(metric2_ref)
        self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")

        # Make a new DatasetRef with the compatible but different DatasetType.
        # This should now return a dict.
        new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
        test_dict2 = butler.getDirect(new_ref)
        self.assertEqual(get_full_type_name(test_dict2), "dict")

        # Get it again with the wrong dataset type definition using get()
        # rather than getDirect(). This should be consistent with getDirect()
        # behavior and return the type of the DatasetType.
        test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
        self.assertEqual(get_full_type_name(test_dict3), "dict")

    def testIngest(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        # Test "move" transfer to ensure that the files themselves
        # have disappeared following ingest.
        with ResourcePath.temporary_uri(suffix=".yaml") as tempFile:
            tempFile.transfer_from(ResourcePath(metricFile), transfer="copy")

            datasets = []
            datasets.append(FileDataset(path=tempFile, refs=refs, formatter=MultiDetectorFormatter))

            butler.ingest(*datasets, transfer="move", record_validation_info=False)
            self.assertFalse(tempFile.exists())

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory cannot ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)
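
    # Note on the single-file ingest above: one FileDataset may carry
    # several refs (one file containing multiple datasets), in which case
    # the datastore records the same URI for every ref, which is why uri1
    # and uri2 compare equal for visit 424 but not for visit 423.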

    def testPruneCollections(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge and not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(sqlalchemy.exc.IntegrityError):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(sqlalchemy.exc.IntegrityError):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        existence = butler.datastore.knows_these([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        self.assertTrue(butler.datastore.knows(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [])

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)
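
    # Summary of the constraint exercised above: a RUN collection cannot be
    # pruned while a CHAINED collection still references it; the attempted
    # deletes roll back completely with an IntegrityError until the chain
    # itself has been removed.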

    def testPickle(self):
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            (
                "instrument",
                {"instrument": "DummyCam"},
                {"instrument": "DummyHSC"},
                {"instrument": "DummyCamComp"},
            ),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry entries are not created
        # for components but querying them can return the components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry: set[DatasetType] = set()
        for parent_dataset_type in butler.registry.queryDatasetTypes():
            fromRegistry.add(parent_dataset_type)
            fromRegistry.update(parent_dataset_type.makeAllComponentDatasetTypes())
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root.
        if self.fullConfigKey is None:
            return

        # Create two separate directories.
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" lacks a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them.
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with a relocatable Butler repo.
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # And check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)
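
    # The rewriting tested above works because the registry can match a
    # data ID given as {"seq_num": ..., "day_obs": ...} against the
    # exposure dimension records inserted earlier, resolving it to the
    # corresponding "exposure" value before the put; gets with either form
    # of the data ID then find the same dataset.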


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check if a file exists at a given path (relative to root).

        The test testPutTemplates verifies the actual physical existence
        of the files in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(uri.exists())
        self.assertTrue(
            uri.unquoted_path.endswith(f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle")
        )

        # Check the template based on dimensions
        if hasattr(butler.datastore, "templates"):
            butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(uri.exists())
        self.assertTrue(
            uri.unquoted_path.endswith(f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle")
        )

        # Check the template based on dimensions
        if hasattr(butler.datastore, "templates"):
            butler.datastore.templates.validateTemplates([ref])

        # Use a template that has a typo in dimension record metadata.
        # Easier to test with a butler that has a ref with records attached.
        template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits")
        with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"):
            path = template.format(ref)
        self.assertEqual(path, f"a/v423/{ref.id}_fits")

        template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits")
        with self.assertRaises(KeyError):
            with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"):
                template.format(ref)

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """This test does an export to a temporary directory and an import
        back into a new temporary directory repo. It does not assume a posix
        datastore.
        """
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements
                # even though there aren't any in these datasets or in the
                # database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler.
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public API.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                        reuse_ids=False,
                    )
                importButler = Butler(importDir, run=self.default_run)
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )
1376 def testRemoveRuns(self):
1377 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1378 butler = Butler(self.tmpConfigFile, writeable=True)
1379 # Load registry data with dimensions to hang datasets off of.
1380 registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
1381 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1382 # Add some RUN-type collections.
1383 run1 = "run1"
1384 butler.registry.registerRun(run1)
1385 run2 = "run2"
1386 butler.registry.registerRun(run2)
1387 # Put a dataset in each.
1388 metric = makeExampleMetrics()
1389 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1390 datasetType = self.addDatasetType(
1391 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1392 )
1393 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1394 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1395 uri1 = butler.getURI(ref1, collections=[run1])
1396 uri2 = butler.getURI(ref2, collections=[run2])
1397 # Remove from both runs with different values for unstore.
1398 butler.removeRuns([run1], unstore=True)
1399 butler.removeRuns([run2], unstore=False)
1400 # There should be nothing in the registry for either one, and the
1401 # datastore should not think either exists.
1402 with self.assertRaises(MissingCollectionError):
1403 butler.registry.getCollectionType(run1)
1404 with self.assertRaises(MissingCollectionError):
1405 butler.registry.getCollectionType(run2)
1406 self.assertFalse(butler.datastore.exists(ref1))
1407 self.assertFalse(butler.datastore.exists(ref2))
1408 # The ref we unstored should be gone according to the URI, but the
1409 # one we removed without unstoring should still be around.
1410 self.assertFalse(uri1.exists())
1411 self.assertTrue(uri2.exists())
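# For reference, the two removeRuns() modes checked above differ only in
# what happens to the file artifacts (a sketch; the butler and run names
# are hypothetical):
#
#     butler.removeRuns(["run1"], unstore=True)   # registry entries and
#                                                 # file artifacts removed
#     butler.removeRuns(["run2"], unstore=False)  # registry entries removed,
#                                                 # file artifacts left alone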
1414class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1415 """PosixDatastore specialization of a butler"""
1417 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1418 fullConfigKey = ".datastore.formatters"
1419 validationCanFail = True
1420 datastoreStr = ["/tmp"]
1421 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1422 registryStr = "/gen3.sqlite3"
1424 def testPathConstructor(self):
1425 """Independent test of constructor using PathLike."""
1426 butler = Butler(self.tmpConfigFile, run=self.default_run)
1427 self.assertIsInstance(butler, Butler)
1429 # And again with a Path object with the butler yaml
1430 path = pathlib.Path(self.tmpConfigFile)
1431 butler = Butler(path, writeable=False)
1432 self.assertIsInstance(butler, Butler)
1434 # And again with a Path object without the butler yaml
1435 # (making sure we skip it if the tmp config doesn't end
1436 # in butler.yaml -- which is the case for a subclass)
1437 if self.tmpConfigFile.endswith("butler.yaml"):
1438 path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
1439 butler = Butler(path, writeable=False)
1440 self.assertIsInstance(butler, Butler)
1442 def testExportTransferCopy(self):
1443 """Test local export using all transfer modes"""
1444 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1445 exportButler = self.runPutGetTest(storageClass, "test_metric")
1446 # Test that the repo actually has at least one dataset.
1447 datasets = list(exportButler.registry.queryDatasets(..., collections=...))
1448 self.assertGreater(len(datasets), 0)
1449 uris = [exportButler.getURI(d) for d in datasets]
1450 datastoreRoot = exportButler.datastore.root
1452 pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]
1454 for path in pathsInStore:
1455 # Assume local file system
1456 self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")
1458 for transfer in ("copy", "link", "symlink", "relsymlink"):
1459 with safeTestTempDir(TESTDIR) as exportDir:
1460 with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
1461 export.saveDatasets(datasets)
1462 for path in pathsInStore:
1463 self.assertTrue(
1464 self.checkFileExists(exportDir, path),
1465 f"Check that mode {transfer} exported files",
1466 )
1468 def testPruneDatasets(self):
1469 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1470 butler = Butler(self.tmpConfigFile, writeable=True)
1471 # Load registry data with dimensions to hang datasets off of.
1472 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1473 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1474 # Add some RUN-type collections.
1475 run1 = "run1"
1476 butler.registry.registerRun(run1)
1477 run2 = "run2"
1478 butler.registry.registerRun(run2)
1479 # put some datasets. ref1 and ref2 have the same data ID, and are in
1480 # different runs. ref3 has a different data ID.
1481 metric = makeExampleMetrics()
1482 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1483 datasetType = self.addDatasetType(
1484 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1485 )
1486 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1487 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1488 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1490 # Simple prune.
1491 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1492 with self.assertRaises(LookupError):
1493 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)
1495 # Put data back.
1496 ref1 = butler.put(metric, ref1.unresolved(), run=run1)
1497 ref2 = butler.put(metric, ref2.unresolved(), run=run2)
1498 ref3 = butler.put(metric, ref3.unresolved(), run=run1)
1500 # Check that in normal mode, deleting the datastore record means
1501 # that trash will not touch the file.
1502 uri1 = butler.datastore.getURI(ref1)
1503 butler.datastore.bridge.moveToTrash([ref1], transaction=None) # Update the dataset_location table
1504 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
1505 butler.datastore.trash(ref1)
1506 butler.datastore.emptyTrash()
1507 self.assertTrue(uri1.exists())
1508 uri1.remove() # Clean it up.
1510 # Simulate an execution butler setup by deleting the datastore
1511 # record but keeping the file around and enabling trust mode.
1512 butler.datastore.trustGetRequest = True
1513 uri2 = butler.datastore.getURI(ref2)
1514 uri3 = butler.datastore.getURI(ref3)
1515 self.assertTrue(uri2.exists())
1516 self.assertTrue(uri3.exists())
1518 # Remove the datastore record.
1519 butler.datastore.bridge.moveToTrash([ref2], transaction=None) # Update the dataset_location table
1520 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
1521 self.assertTrue(uri2.exists())
1522 butler.datastore.trash([ref2, ref3])
1523 # Immediate removal of the ref2 file because its record is gone.
1524 self.assertFalse(uri2.exists())
1525 # But ref3 has to wait for emptyTrash().
1526 self.assertTrue(uri3.exists())
1527 butler.datastore.emptyTrash()
1528 self.assertFalse(uri3.exists())
1530 # Clear out the datasets from registry.
1531 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
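# The deletion model exercised above is two-phase: trash() marks refs for
# removal and emptyTrash() deletes the artifacts. The exception, shown with
# ref2, is trust mode (trustGetRequest=True), where a trashed ref with no
# datastore record is removed immediately. A sketch with a hypothetical ref:
#
#     butler.datastore.trash(ref)     # phase 1: mark for removal
#     butler.datastore.emptyTrash()   # phase 2: delete the file artifacts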
1533 def testPytypeCoercion(self):
1534 """Test python type coercion on Butler.get and put."""
1536 # Store some data with the normal example storage class.
1537 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1538 datasetTypeName = "test_metric"
1539 butler = self.runPutGetTest(storageClass, datasetTypeName)
1541 dataId = {"instrument": "DummyCamComp", "visit": 423}
1542 metric = butler.get(datasetTypeName, dataId=dataId)
1543 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1545 datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
1546 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1548 # Now need to hack the registry dataset type definition.
1549 # There is no API for this.
1550 manager = butler.registry._managers.datasets
1551 manager._db.update(
1552 manager._static.dataset_type,
1553 {"name": datasetTypeName},
1554 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1555 )
1557 # Force reset of dataset type cache
1558 butler.registry.refresh()
1560 datasetType_new = butler.registry.getDatasetType(datasetTypeName)
1561 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1562 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1564 metric_model = butler.get(datasetTypeName, dataId=dataId)
1565 self.assertNotEqual(type(metric_model), type(metric))
1566 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1568 # Put the model and read it back to show that everything now
1569 # works as normal.
1570 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1571 metric_model_new = butler.get(metric_ref)
1572 self.assertEqual(metric_model_new, metric_model)
1574 # Hack the storage class again to something that will fail on
1575 # get because no conversion is available.
1576 manager._db.update(
1577 manager._static.dataset_type,
1578 {"name": datasetTypeName},
1579 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1580 )
1581 butler.registry.refresh()
1583 with self.assertRaises(ValueError):
1584 butler.get(datasetTypeName, dataId=dataId)
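# In outline, the coercion behaviour verified above: the Python type
# returned by get() follows the storage class currently registered for the
# dataset type, and succeeds only if that class is convertible from the
# stored one (a sketch; the data ID is the hypothetical one used above):
#
#     obj = butler.get("test_metric", dataId=dataId)
#     # -> MetricsExampleModel after the registry definition is switched to
#     #    StructuredDataNoComponentsModel; switching to an incompatible
#     #    storage class makes get() raise ValueError instead.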
1587@unittest.skipUnless(testing is not None, "testing.postgresql module not found")
1588class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1589 """PosixDatastore specialization of a butler using Postgres"""
1591 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1592 fullConfigKey = ".datastore.formatters"
1593 validationCanFail = True
1594 datastoreStr = ["/tmp"]
1595 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1596 registryStr = "PostgreSQL@test"
1598 @staticmethod
1599 def _handler(postgresql):
1600 engine = sqlalchemy.engine.create_engine(postgresql.url())
1601 with engine.begin() as connection:
1602 connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;"))
1604 @classmethod
1605 def setUpClass(cls):
1606 # Create the postgres test server.
1607 cls.postgresql = testing.postgresql.PostgresqlFactory(
1608 cache_initialized_db=True, on_initialized=cls._handler
1609 )
1610 super().setUpClass()
1612 @classmethod
1613 def tearDownClass(cls):
1614 # Clean up any lingering SQLAlchemy engines/connections
1615 # so they're closed before we shut down the server.
1616 gc.collect()
1617 cls.postgresql.clear_cache()
1618 super().tearDownClass()
1620 def setUp(self):
1621 self.server = self.postgresql()
1623 # Need to add a registry section to the config.
1624 self._temp_config = False
1625 config = Config(self.configFile)
1626 config["registry", "db"] = self.server.url()
1627 with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh:
1628 config.dump(fh)
1629 self.configFile = fh.name
1630 self._temp_config = True
1631 super().setUp()
1633 def tearDown(self):
1634 self.server.stop()
1635 if self._temp_config and os.path.exists(self.configFile):
1636 os.remove(self.configFile)
1637 super().tearDown()
1639 def testMakeRepo(self):
1640 # The base class test assumes that it is using SQLite and that
1641 # the config file is acceptable to SQLite.
1642 raise unittest.SkipTest("Postgres config is not compatible with this test.")
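# In brief, the Postgres specialization above only rewrites the registry
# connection in a throwaway copy of the config (a sketch; the connection
# URL is hypothetical):
#
#     config = Config(configFile)
#     config["registry", "db"] = "postgresql://user@localhost:5432/demo"
#     with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh:
#         config.dump(fh)
#     butler = Butler(fh.name, writeable=True)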
1645class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1646 """InMemoryDatastore specialization of a butler"""
1648 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1649 fullConfigKey = None
1650 useTempRoot = False
1651 validationCanFail = False
1652 datastoreStr = ["datastore='InMemory"]
1653 datastoreName = ["InMemoryDatastore@"]
1654 registryStr = "/gen3.sqlite3"
1656 def testIngest(self):
1657 pass
1660class ChainedDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1661 """PosixDatastore specialization"""
1663 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
1664 fullConfigKey = ".datastore.datastores.1.formatters"
1665 validationCanFail = True
1666 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
1667 datastoreName = [
1668 "InMemoryDatastore@",
1669 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
1670 "SecondDatastore",
1671 ]
1672 registryStr = "/gen3.sqlite3"
1675class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
1676 """Test that a yaml file in one location can refer to a root in another."""
1678 datastoreStr = ["dir1"]
1679 # Disable the makeRepo test since we are deliberately not using
1680 # butler.yaml as the config name.
1681 fullConfigKey = None
1683 def setUp(self):
1684 self.root = makeTestTempDir(TESTDIR)
1686 # Make a new repository in one place
1687 self.dir1 = os.path.join(self.root, "dir1")
1688 Butler.makeRepo(self.dir1, config=Config(self.configFile))
1690 # Move the yaml file to a different place and add a "root"
1691 self.dir2 = os.path.join(self.root, "dir2")
1692 os.makedirs(self.dir2, exist_ok=True)
1693 configFile1 = os.path.join(self.dir1, "butler.yaml")
1694 config = Config(configFile1)
1695 config["root"] = self.dir1
1696 configFile2 = os.path.join(self.dir2, "butler2.yaml")
1697 config.dumpToUri(configFile2)
1698 os.remove(configFile1)
1699 self.tmpConfigFile = configFile2
1701 def testFileLocations(self):
1702 self.assertNotEqual(self.dir1, self.dir2)
1703 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
1704 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
1705 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
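# The relocation pattern under test, in brief (a sketch; repoDir and
# elsewhere are hypothetical paths):
#
#     config = Config(os.path.join(repoDir, "butler.yaml"))
#     config["root"] = repoDir          # record where the repo really lives
#     config.dumpToUri(os.path.join(elsewhere, "butler2.yaml"))
#     butler = Butler(os.path.join(elsewhere, "butler2.yaml"))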
1708class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
1709 """Test that a config file created by makeRepo outside of repo works."""
1711 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1713 def setUp(self):
1714 self.root = makeTestTempDir(TESTDIR)
1715 self.root2 = makeTestTempDir(TESTDIR)
1717 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
1718 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1720 def tearDown(self):
1721 if os.path.exists(self.root2):
1722 shutil.rmtree(self.root2, ignore_errors=True)
1723 super().tearDown()
1725 def testConfigExistence(self):
1726 c = Config(self.tmpConfigFile)
1727 uri_config = ResourcePath(c["root"])
1728 uri_expected = ResourcePath(self.root, forceDirectory=True)
1729 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
1730 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
1732 def testPutGet(self):
1733 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1734 self.runPutGetTest(storageClass, "test_metric")
1737class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
1738 """Test that a config file created by makeRepo outside of repo works."""
1740 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1742 def setUp(self):
1743 self.root = makeTestTempDir(TESTDIR)
1744 self.root2 = makeTestTempDir(TESTDIR)
1746 self.tmpConfigFile = self.root2
1747 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1749 def testConfigExistence(self):
1750 # Append the yaml file name, otherwise the Config constructor does
1751 # not know the file type.
1752 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
1753 super().testConfigExistence()
1756class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
1757 """Test that a config file created by makeRepo outside of repo works."""
1759 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1761 def setUp(self):
1762 self.root = makeTestTempDir(TESTDIR)
1763 self.root2 = makeTestTempDir(TESTDIR)
1765 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
1766 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1769@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
1770class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1771 """S3Datastore specialization of a butler; an S3 storage Datastore +
1772 a local SQLite SqlRegistry.
1773 """
1775 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
1776 fullConfigKey = None
1777 validationCanFail = True
1779 bucketName = "anybucketname"
1780 """Name of the Bucket that will be used in the tests. The name is read from
1781 the config file used with the tests during set-up.
1782 """
1784 root = "butlerRoot/"
1785 """Root repository directory expected to be used in case useTempRoot=False.
1786 Otherwise the root is set to a 20 characters long randomly generated string
1787 during set-up.
1788 """
1790 datastoreStr = [f"datastore={root}"]
1791 """Contains all expected root locations in a format expected to be
1792 returned by Butler stringification.
1793 """
1795 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
1796 """The expected format of the S3 Datastore string."""
1798 registryStr = "/gen3.sqlite3"
1799 """Expected format of the Registry string."""
1801 mock_s3 = mock_s3()
1802 """The mocked s3 interface from moto."""
1804 def genRoot(self):
1805 """Returns a random string of len 20 to serve as a root
1806 name for the temporary bucket repo.
1808 This is equivalent to tempfile.mkdtemp as this is what self.root
1809 becomes when useTempRoot is True.
1810 """
1811 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1812 return rndstr + "/"
1814 def setUp(self):
1815 config = Config(self.configFile)
1816 uri = ResourcePath(config[".datastore.datastore.root"])
1817 self.bucketName = uri.netloc
1819 # Enable S3 mocking of tests.
1820 self.mock_s3.start()
1822 # set up some fake credentials if they do not exist
1823 self.usingDummyCredentials = setAwsEnvCredentials()
1825 if self.useTempRoot:
1826 self.root = self.genRoot()
1827 rooturi = f"s3://{self.bucketName}/{self.root}"
1828 config.update({"datastore": {"datastore": {"root": rooturi}}})
1830 # Need a local folder to store the registry database.
1831 self.reg_dir = makeTestTempDir(TESTDIR)
1832 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1834 # Moto needs to know that we expect the bucket self.bucketName to
1835 # exist (this used to be the class attribute bucketName).
1836 s3 = boto3.resource("s3")
1837 s3.create_bucket(Bucket=self.bucketName)
1839 self.datastoreStr = f"datastore={self.root}"
1840 self.datastoreName = [f"FileDatastore@{rooturi}"]
1841 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
1842 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
1844 def tearDown(self):
1845 s3 = boto3.resource("s3")
1846 bucket = s3.Bucket(self.bucketName)
1847 try:
1848 bucket.objects.all().delete()
1849 except botocore.exceptions.ClientError as e:
1850 if e.response["Error"]["Code"] == "404":
1851 # the key was not reachable - pass
1852 pass
1853 else:
1854 raise
1856 bucket = s3.Bucket(self.bucketName)
1857 bucket.delete()
1859 # Stop the S3 mock.
1860 self.mock_s3.stop()
1862 # unset any potentially set dummy credentials
1863 if self.usingDummyCredentials:
1864 unsetAwsEnvCredentials()
1866 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1867 shutil.rmtree(self.reg_dir, ignore_errors=True)
1869 if self.useTempRoot and os.path.exists(self.root):
1870 shutil.rmtree(self.root, ignore_errors=True)
1872 super().tearDown()
1875@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!")
1876class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1877 """WebdavDatastore specialization of a butler; a Webdav storage Datastore +
1878 a local SQLite SqlRegistry.
1879 """
1881 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml")
1882 fullConfigKey = None
1883 validationCanFail = True
1885 serverName = "localhost"
1886 """Name of the server that will be used in the tests.
1887 """
1889 portNumber = 8080
1890 """Port on which the webdav server listens. Automatically chosen
1891 at setUpClass via the _getfreeport() method
1892 """
1894 root = "butlerRoot/"
1895 """Root repository directory expected to be used in case useTempRoot=False.
1896 Otherwise the root is set to a 20 characters long randomly generated string
1897 during set-up.
1898 """
1900 datastoreStr = [f"datastore={root}"]
1901 """Contains all expected root locations in a format expected to be
1902 returned by Butler stringification.
1903 """
1905 datastoreName = ["FileDatastore@https://{serverName}/{root}"]
1906 """The expected format of the WebdavDatastore string."""
1908 registryStr = "/gen3.sqlite3"
1909 """Expected format of the Registry string."""
1911 serverThread = None
1912 """Thread in which the local webdav server will run"""
1914 stopWebdavServer = False
1915 """This flag will cause the webdav server to
1916 gracefully shut down when True
1917 """
1919 def genRoot(self):
1920 """Returns a random string of len 20 to serve as a root
1921 name for the temporary bucket repo.
1923 This is equivalent to tempfile.mkdtemp as this is what self.root
1924 becomes when useTempRoot is True.
1925 """
1926 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1927 return rndstr + "/"
1929 @classmethod
1930 def setUpClass(cls):
1931 # Do the same as inherited class
1932 cls.storageClassFactory = StorageClassFactory()
1933 cls.storageClassFactory.addFromConfig(cls.configFile)
1935 cls.portNumber = cls._getfreeport()
1936 # Run a local webdav server on which tests will be run
1937 cls.serverThread = Thread(
1938 target=cls._serveWebdav, args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), daemon=True
1939 )
1940 cls.serverThread.start()
1941 # Wait for it to start
1942 time.sleep(3)
1944 @classmethod
1945 def tearDownClass(cls):
1946 # Ask for graceful shut down of the webdav server
1947 cls.stopWebdavServer = True
1948 # Wait for the thread to exit
1949 cls.serverThread.join()
1950 super().tearDownClass()
1952 def setUp(self):
1953 config = Config(self.configFile)
1955 if self.useTempRoot:
1956 self.root = self.genRoot()
1957 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}"
1958 config.update({"datastore": {"datastore": {"root": self.rooturi}}})
1960 # Need a local folder to store the registry database.
1961 self.reg_dir = makeTestTempDir(TESTDIR)
1962 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1964 self.datastoreStr = f"datastore={self.root}"
1965 self.datastoreName = [f"FileDatastore@{self.rooturi}"]
1967 if not _is_webdav_endpoint(self.rooturi):
1968 raise OSError("Webdav server not running properly: cannot run tests.")
1970 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False)
1971 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml")
1973 def tearDown(self):
1974 # Clear temporary directory
1975 ResourcePath(self.rooturi).remove()
1976 ResourcePath(self.rooturi).session.close()
1978 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1979 shutil.rmtree(self.reg_dir, ignore_errors=True)
1981 if self.useTempRoot and os.path.exists(self.root):
1982 shutil.rmtree(self.root, ignore_errors=True)
1984 super().tearDown()
1986 def _serveWebdav(self, port: int, stopWebdavServer):
1987 """Starts a local webdav-compatible HTTP server,
1988 Listening on http://localhost:port
1989 This server only runs when this test class is instantiated,
1990 and then shuts down. Must be started is a separate thread.
1992 Parameters
1993 ----------
1994 port : `int`
1995 The port number on which the server should listen.
1996 """
1997 root_path = gettempdir()
1999 config = {
2000 "host": "0.0.0.0",
2001 "port": port,
2002 "provider_mapping": {"/": root_path},
2003 "http_authenticator": {"domain_controller": None},
2004 "simple_dc": {"user_mapping": {"*": True}},
2005 "verbose": 0,
2006 }
2007 app = WsgiDAVApp(config)
2009 server_args = {
2010 "bind_addr": (config["host"], config["port"]),
2011 "wsgi_app": app,
2012 }
2013 server = wsgi.Server(**server_args)
2014 server.prepare()
2016 try:
2017 # Start the actual server in a separate thread
2018 t = Thread(target=server.serve, daemon=True)
2019 t.start()
2020 # watch stopWebdavServer, and gracefully
2021 # shut down the server when True
2022 while True:
2023 if stopWebdavServer():
2024 break
2025 time.sleep(1)
2026 except KeyboardInterrupt:
2027 print("Caught Ctrl-C, shutting down...")
2028 finally:
2029 server.stop()
2030 t.join()
@staticmethod
2032 def _getfreeport():
2033 """
2034 Determine a free port using sockets.
2035 """
2036 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
2037 free_socket.bind(("127.0.0.1", 0))
2038 free_socket.listen()
2039 port = free_socket.getsockname()[1]
2040 free_socket.close()
2041 return port
2044class PosixDatastoreTransfers(unittest.TestCase):
2045 """Test data transfers between butlers.
2047 Test for different managers. UUID to UUID and integer to integer are
2048 tested. UUID to integer is not supported since we do not currently
2049 want to allow that. Integer to UUID is supported with the caveat
2050 that UUID4 will be generated and this will be incorrect for raw
2051 dataset types. The test ignores that.
2052 """
2054 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
2056 @classmethod
2057 def setUpClass(cls):
2058 cls.storageClassFactory = StorageClassFactory()
2059 cls.storageClassFactory.addFromConfig(cls.configFile)
2061 def setUp(self):
2062 self.root = makeTestTempDir(TESTDIR)
2063 self.config = Config(self.configFile)
2065 def tearDown(self):
2066 removeTestTempDir(self.root)
2068 def create_butler(self, manager, label):
2069 config = Config(self.configFile)
2070 config["registry", "managers", "datasets"] = manager
2071 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
2073 def create_butlers(self, manager1, manager2):
2074 self.source_butler = self.create_butler(manager1, "1")
2075 self.target_butler = self.create_butler(manager2, "2")
2077 def testTransferUuidToUuid(self):
2078 self.create_butlers(
2079 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2080 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2081 )
2082 # Setting id_gen_map should have no effect here
2083 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
2085 def testTransferMissing(self):
2086 """Test transfers where datastore records are missing.
2088 This is how execution butler works.
2089 """
2090 self.create_butlers(
2091 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2092 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2093 )
2095 # Configure the source butler to allow trust.
2096 self.source_butler.datastore.trustGetRequest = True
2098 self.assertButlerTransfers(purge=True)
2100 def testTransferMissingDisassembly(self):
2101 """Test transfers where datastore records are missing.
2103 This is how execution butler works.
2104 """
2105 self.create_butlers(
2106 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2107 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2108 )
2110 # Configure the source butler to allow trust.
2111 self.source_butler.datastore.trustGetRequest = True
2113 # Test disassembly.
2114 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
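# The core call that assertButlerTransfers() below builds up to, in its
# simplest form (a sketch; source_refs is a hypothetical list of
# DatasetRefs from the source butler):
#
#     transferred = target_butler.transfer_from(
#         source_butler,
#         source_refs,
#         register_dataset_types=True,  # create missing dataset types
#         transfer_dimensions=True,     # copy required dimension records
#     )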
2116 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"):
2117 """Test that a run can be transferred to another butler."""
2119 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
2120 datasetTypeName = "random_data"
2122 # The test will create 3 collections and we will want to transfer
2123 # two of those three.
2124 runs = ["run1", "run2", "other"]
2126 # Also want to use two different dataset types to ensure that
2127 # grouping works.
2128 datasetTypeNames = ["random_data", "random_data_2"]
2130 # Create the run collections in the source butler.
2131 for run in runs:
2132 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
2134 # Create dimensions in source butler.
2135 n_exposures = 30
2136 self.source_butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
2137 self.source_butler.registry.insertDimensionData(
2138 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
2139 )
2140 self.source_butler.registry.insertDimensionData(
2141 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
2142 )
2144 for i in range(n_exposures):
2145 self.source_butler.registry.insertDimensionData(
2146 "exposure",
2147 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
2148 )
2150 # Create dataset types in the source butler.
2151 dimensions = self.source_butler.registry.dimensions.extract(["instrument", "exposure"])
2152 for datasetTypeName in datasetTypeNames:
2153 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
2154 self.source_butler.registry.registerDatasetType(datasetType)
2156 # Write a dataset to an unrelated run -- this will ensure that
2157 # we are rewriting integer dataset ids in the target if necessary.
2158 # Will not be relevant for UUID.
2159 run = "distraction"
2160 butler = Butler(butler=self.source_butler, run=run)
2161 butler.put(
2162 makeExampleMetrics(),
2163 datasetTypeName,
2164 exposure=1,
2165 instrument="DummyCamComp",
2166 physical_filter="d-r",
2167 )
2169 # Write some example metrics to the source
2170 butler = Butler(butler=self.source_butler)
2172 # Set of DatasetRefs that should be in the list of refs to transfer
2173 # but which will not be transferred.
2174 deleted = set()
2176 n_expected = 20 # Number of datasets expected to be transferred
2177 source_refs = []
2178 for i in range(n_exposures):
2179 # Put a third of the datasets into each collection; only retain
2180 # two thirds.
2181 index = i % 3
2182 run = runs[index]
2183 datasetTypeName = datasetTypeNames[i % 2]
2185 metric_data = {
2186 "summary": {"counter": i},
2187 "output": {"text": "metric"},
2188 "data": [2 * x for x in range(i)],
2189 }
2190 metric = MetricsExample(**metric_data)
2191 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
2192 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
2194 # Remove the datastore record using the low-level API
2195 if purge:
2196 # Remove records for a fraction.
2197 if index == 1:
2198 # For one of these delete the file as well.
2199 # This allows the "missing" code to filter the
2200 # file out.
2201 if not deleted:
2202 primary, uris = butler.datastore.getURIs(ref)
2203 if primary:
2204 primary.remove()
2205 for uri in uris.values():
2206 uri.remove()
2207 n_expected -= 1
2208 deleted.add(ref)
2210 # Remove the datastore record.
2211 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id})
2213 if index < 2:
2214 source_refs.append(ref)
2215 if ref not in deleted:
2216 new_metric = butler.get(ref.unresolved(), collections=run)
2217 self.assertEqual(new_metric, metric)
2219 # Create some bad dataset types to ensure we check for inconsistent
2220 # definitions.
2221 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
2222 for datasetTypeName in datasetTypeNames:
2223 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
2224 self.target_butler.registry.registerDatasetType(datasetType)
2225 with self.assertRaises(ConflictingDefinitionError) as cm:
2226 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2227 self.assertIn("dataset type differs", str(cm.exception))
2229 # And remove the bad definitions.
2230 for datasetTypeName in datasetTypeNames:
2231 self.target_butler.registry.removeDatasetType(datasetTypeName)
2233 # Transfer without creating dataset types should fail.
2234 with self.assertRaises(KeyError):
2235 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2237 # Transfer without creating dimensions should fail.
2238 with self.assertRaises(ConflictingDefinitionError) as cm:
2239 self.target_butler.transfer_from(
2240 self.source_butler, source_refs, id_gen_map=id_gen_map, register_dataset_types=True
2241 )
2242 self.assertIn("dimension", str(cm.exception))
2244 # The failed transfer above leaves the registry in an inconsistent
2245 # state because the run is created but then rolled back without
2246 # the collection cache being cleared. For now, force a refresh.
2247 # This can be removed with DM-35498.
2248 self.target_butler.registry.refresh()
2250 # Now transfer them to the second butler, including dimensions.
2251 with self.assertLogs(level=logging.DEBUG) as cm:
2252 transferred = self.target_butler.transfer_from(
2253 self.source_butler,
2254 source_refs,
2255 id_gen_map=id_gen_map,
2256 register_dataset_types=True,
2257 transfer_dimensions=True,
2258 )
2259 self.assertEqual(len(transferred), n_expected)
2260 log_output = ";".join(cm.output)
2261 self.assertIn("found in datastore for chunk", log_output)
2262 self.assertIn("Creating output run", log_output)
2264 # Do the transfer twice to ensure that it will do nothing extra.
2265 # Only do this if purge=True because it does not work for int
2266 # dataset_id.
2267 if purge:
2268 # This should not need to register dataset types.
2269 transferred = self.target_butler.transfer_from(
2270 self.source_butler, source_refs, id_gen_map=id_gen_map
2271 )
2272 self.assertEqual(len(transferred), n_expected)
2274 # Also do an explicit low-level transfer to trigger some
2275 # edge cases.
2276 with self.assertLogs(level=logging.DEBUG) as cm:
2277 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
2278 log_output = ";".join(cm.output)
2279 self.assertIn("no file artifacts exist", log_output)
2281 with self.assertRaises(TypeError):
2282 self.target_butler.datastore.transfer_from(self.source_butler, source_refs)
2284 with self.assertRaises(ValueError):
2285 self.target_butler.datastore.transfer_from(
2286 self.source_butler.datastore, source_refs, transfer="split"
2287 )
2289 # Now try to get the same refs from the new butler.
2290 for ref in source_refs:
2291 if ref not in deleted:
2292 unresolved_ref = ref.unresolved()
2293 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run)
2294 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run)
2295 self.assertEqual(new_metric, old_metric)
2297 # Now prune the run2 collection and instead create a CHAINED
2298 # collection. This should block the transfer.
2299 self.target_butler.pruneCollection("run2", purge=True, unstore=True)
2300 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2301 with self.assertRaises(CollectionTypeError):
2302 # Re-importing the run1 datasets can be problematic if they
2303 # use integer IDs so filter those out.
2304 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2305 self.target_butler.transfer_from(self.source_butler, to_transfer, id_gen_map=id_gen_map)
2308 if __name__ == "__main__":
2309 unittest.main()