Coverage for tests/test_butler.py: 16%
1215 statements
coverage.py v6.4.4, created at 2022-08-28 07:52 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler.
"""

import gc
import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import socket
import string
import tempfile
import time
import unittest
import unittest.mock  # Needed for unittest.mock.patch.dict in testConstructor.
from tempfile import gettempdir
from threading import Thread

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto's mock_s3 cannot be imported."""
        return cls
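
# Note: with the fallback above, ``@mock_s3`` applied to a test class is a
# pass-through, so S3-backed tests must still be skipped explicitly (for
# example with ``@unittest.skipIf(boto3 is None, ...)``) rather than relying
# on the decorator to stub anything out.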

try:
    # It's possible but silly to have testing.postgresql installed without
    # having the postgresql server installed (because then nothing in
    # testing.postgresql would work), so we use the presence of that module
    # to test whether we can expect the server to be available.
    import testing.postgresql
except ImportError:
    testing = None

try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None

import astropy.time
import sqlalchemy
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    CollectionSearch,
    CollectionType,
    Config,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplate,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
)
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.http import _is_webdav_endpoint
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImport
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )
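
# The three positional arguments to MetricsExample above map, in order, to the
# ``summary``, ``output``, and ``data`` attributes that the put/get tests
# below compare against (see assertGetComponents and the slicing tests in
# runPutGetTest).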


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent misdiagnosis
    that might otherwise occur when a standard exception is used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not tested in any other test
    cases."""

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests:
    """Helper methods for running a suite of put/get tests from different
    butler configurations."""

    root = None
    default_run = "ingésτ😺"

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
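            # componentTypeName composes the parent and component names, so
            # e.g. the "summary" component of dataset type "test_metric" is
            # looked up as "test_metric.summary".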
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
        removeTestTempDir(self.root)

    def create_butler(self, run, storageClass, datasetTypeName):
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "visit_system": 1,
                },
            )
        return butler, datasetType
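
    # Note: tests below rely on the dimension records registered by
    # create_butler: visits 423, 424, and 425 for instrument "DummyCamComp"
    # with physical_filter "d-r".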

    def runPutGetTest(self, storageClass, datasetTypeName):
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = self.default_run
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType.

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

            # Can the artifacts themselves be retrieved?
            if not butler.datastore.isEphemeral:
                root_uri = ResourcePath(self.root)

                for preserve_path in (True, False):
                    destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                    # Use copy so that we can test that overwrite
                    # protection works (using "auto" for File URIs would
                    # use hard links and subsequent transfer would work
                    # because it knows they are the same file).
                    transferred = butler.retrieveArtifacts(
                        [ref], destination, preserve_path=preserve_path, transfer="copy"
                    )
                    self.assertGreater(len(transferred), 0)
                    artifacts = list(ResourcePath.findFileResources([destination]))
                    self.assertEqual(set(transferred), set(artifacts))

                    for artifact in transferred:
                        path_in_destination = artifact.relative_to(destination)
                        self.assertIsNotNone(path_in_destination)

                        # When path is not preserved there should not be
                        # any path separators.
                        num_seps = path_in_destination.count("/")
                        if preserve_path:
                            self.assertGreater(num_seps, 0)
                        else:
                            self.assertEqual(num_seps, 0)

                    primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                    n_uris = len(secondary_uris)
                    if primary_uri:
                        n_uris += 1
                    self.assertEqual(
                        len(artifacts),
                        n_uris,
                        "Comparing expected artifacts vs actual:"
                        f" {artifacts} vs {primary_uri} and {secondary_uris}",
                    )

                    if preserve_path:
                        # No need to run these twice
                        with self.assertRaises(ValueError):
                            butler.retrieveArtifacts([ref], destination, transfer="move")

                        with self.assertRaises(FileExistsError):
                            butler.retrieveArtifacts([ref], destination)

                        transferred_again = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, overwrite=True
                        )
                        self.assertEqual(set(transferred_again), set(transferred))

            # Now remove the dataset completely.
            butler.pruneDatasets([ref], purge=True, unstore=True)
            # Lookup with original args should still fail.
            with self.assertRaises(LookupError):
                butler.datasetExists(*args, collections=this_run)
            # getDirect() should still fail.
            with self.assertRaises(FileNotFoundError):
                butler.getDirect(ref)
            # Registry shouldn't be able to find it by dataset_id anymore.
            self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know they are
            # empty.
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present.

        return butler

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Registering a second time is allowed but indicates a no-op.
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection raises a
        # CollectionError.
        with self.assertRaises(CollectionError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Deleting the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run=self.default_run)
            self.assertIsInstance(butler, Butler)

            # Even with a ResourcePath.
            butler = Butler(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {self.default_run})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, CollectionSearch.fromExpression(["other"]))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
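            # For reference, the index dumped below is just a mapping from
            # label to butler config URI, e.g. (values illustrative):
            #   label: /path/to/repo/butler.yaml
            #   bad_label: s3://bucket/not_real.yaml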
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler("not_there", writeable=False)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases"):
            # No aliases registered.
            Butler("not_there")
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testIngest(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile, refs=refs, formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy", record_validation_info=False)

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory cannot ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

    def testPruneCollections(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge but not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [])

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)

    def testPickle(self):
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            (
                "instrument",
                {"instrument": "DummyCam"},
                {"instrument": "DummyHSC"},
                {"instrument": "DummyCamComp"},
            ),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are not
        # created for its components, but querying can still return them.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # Create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""

        butler = Butler(self.tmpConfigFile, run=self.default_run)

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check if a file exists at a given path (relative to root).

        The test testPutTemplates verifies the actual physical existence of
        the files in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(
                butler.datastore.root, f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle"
            ),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(
                butler.datastore.root, f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle"
            ),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Use a template that has a typo in dimension record metadata.
        # Easier to test with a butler that has a ref with records attached.
        template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits")
        with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"):
            path = template.format(ref)
        self.assertEqual(path, f"a/v423/{ref.id}_fits")

        template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits")
        with self.assertRaises(KeyError):
            with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"):
                template.format(ref)
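
        # The only difference from the previous template is the missing ":?"
        # suffix: "?" marks a template field as optional, so above the unknown
        # record attribute visit.namex was dropped with a log message, whereas
        # here the same missing attribute raises KeyError.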
1195 # Now use a file template that will not result in unique filenames
1196 with self.assertRaises(FileTemplateValidationError):
1197 butler.put(metric, "metric3", dataId1)
1199 def testImportExport(self):
1200 # Run put/get tests just to create and populate a repo.
1201 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1202 self.runImportExportTest(storageClass)
1204 @unittest.expectedFailure
1205 def testImportExportVirtualComposite(self):
1206 # Run put/get tests just to create and populate a repo.
1207 storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
1208 self.runImportExportTest(storageClass)
1210 def runImportExportTest(self, storageClass):
1211 """This test does an export to a temp directory and an import back
1212 into a new temp directory repo. It does not assume a posix datastore"""
1213 exportButler = self.runPutGetTest(storageClass, "test_metric")
1214 print("Root:", exportButler.datastore.root)
1215 # Test that the repo actually has at least one dataset.
1216 datasets = list(exportButler.registry.queryDatasets(..., collections=...))
1217 self.assertGreater(len(datasets), 0)
1218 # Add a DimensionRecord that's unused by those datasets.
1219 skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
1220 exportButler.registry.insertDimensionData("skymap", skymapRecord)
1221 # Export and then import datasets.
1222 with safeTestTempDir(TESTDIR) as exportDir:
1223 exportFile = os.path.join(exportDir, "exports.yaml")
1224 with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
1225 export.saveDatasets(datasets)
1226 # Export the same datasets again. This should quietly do
1227 # nothing because of internal deduplication, and it shouldn't
1228 # complain about being asked to export the "htm7" elements even
1229 # though there aren't any in these datasets or in the database.
1230 export.saveDatasets(datasets, elements=["htm7"])
1231 # Save one of the data IDs again; this should be harmless
1232 # because of internal deduplication.
1233 export.saveDataIds([datasets[0].dataId])
1234 # Save some dimension records directly.
1235 export.saveDimensionData("skymap", [skymapRecord])
1236 self.assertTrue(os.path.exists(exportFile))
1237 with safeTestTempDir(TESTDIR) as importDir:
1238 # We always want this to be a local posix butler
1239 Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
1240 # Calling script.butlerImport tests the implementation of the
1241 # butler command line interface "import" subcommand. Functions
1242 # in the script folder are generally considered protected and
1243 # should not be used as public api.
1244 with open(exportFile, "r") as f:
1245 script.butlerImport(
1246 importDir,
1247 export_file=f,
1248 directory=exportDir,
1249 transfer="auto",
1250 skip_dimensions=None,
1251 reuse_ids=False,
1252 )
1253 importButler = Butler(importDir, run=self.default_run)
1254 for ref in datasets:
1255 with self.subTest(ref=ref):
1256 # Test for existence by passing in the DatasetType and
1257 # data ID separately, to avoid lookup by dataset_id.
1258 self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
1259 self.assertEqual(
1260 list(importButler.registry.queryDimensionRecords("skymap")),
1261 [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
1262 )
1264 def testRemoveRuns(self):
1265 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1266 butler = Butler(self.tmpConfigFile, writeable=True)
1267 # Load registry data with dimensions to hang datasets off of.
1268 registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
1269 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1270 # Add some RUN-type collection.
1271 run1 = "run1"
1272 butler.registry.registerRun(run1)
1273 run2 = "run2"
1274 butler.registry.registerRun(run2)
1275 # put a dataset in each
1276 metric = makeExampleMetrics()
1277 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1278 datasetType = self.addDatasetType(
1279 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1280 )
1281 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1282 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1283 uri1 = butler.getURI(ref1, collections=[run1])
1284 uri2 = butler.getURI(ref2, collections=[run2])
1285 # Remove from both runs with different values for unstore.
1286 butler.removeRuns([run1], unstore=True)
1287 butler.removeRuns([run2], unstore=False)
1288 # Should be nothing in registry for either one, and datastore should
1289 # not think either exists.
1290 with self.assertRaises(MissingCollectionError):
1291 butler.registry.getCollectionType(run1)
1292 with self.assertRaises(MissingCollectionError):
1293 butler.registry.getCollectionType(run2)
1294 self.assertFalse(butler.datastore.exists(ref1))
1295 self.assertFalse(butler.datastore.exists(ref2))
1296 # The ref we unstored should be gone according to the URI, but the
1297 # one we forgot should still be around.
1298 self.assertFalse(uri1.exists())
1299 self.assertTrue(uri2.exists())
1302class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1303 """PosixDatastore specialization of a butler"""
1305 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1306 fullConfigKey = ".datastore.formatters"
1307 validationCanFail = True
1308 datastoreStr = ["/tmp"]
1309 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1310 registryStr = "/gen3.sqlite3"
1312 def testPathConstructor(self):
1313 """Independent test of constructor using PathLike."""
1314 butler = Butler(self.tmpConfigFile, run=self.default_run)
1315 self.assertIsInstance(butler, Butler)
1317 # And again with a Path object with the butler yaml
1318 path = pathlib.Path(self.tmpConfigFile)
1319 butler = Butler(path, writeable=False)
1320 self.assertIsInstance(butler, Butler)
1322 # And again with a Path object without the butler yaml
1323 # (making sure we skip it if the tmp config doesn't end
1324 # in butler.yaml -- which is the case for a subclass)
1325 if self.tmpConfigFile.endswith("butler.yaml"):
1326 path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
1327 butler = Butler(path, writeable=False)
1328 self.assertIsInstance(butler, Butler)
1330 def testExportTransferCopy(self):
1331 """Test local export using all transfer modes"""
1332 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1333 exportButler = self.runPutGetTest(storageClass, "test_metric")
1334 # Test that the repo actually has at least one dataset.
1335 datasets = list(exportButler.registry.queryDatasets(..., collections=...))
1336 self.assertGreater(len(datasets), 0)
1337 uris = [exportButler.getURI(d) for d in datasets]
1338 datastoreRoot = exportButler.datastore.root
1340 pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]
1342 for path in pathsInStore:
1343 # Assume local file system
1344 self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")
1346 for transfer in ("copy", "link", "symlink", "relsymlink"):
1347 with safeTestTempDir(TESTDIR) as exportDir:
1348 with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
1349 export.saveDatasets(datasets)
1350 for path in pathsInStore:
1351 self.assertTrue(
1352 self.checkFileExists(exportDir, path),
1353 f"Check that mode {transfer} exported files",
1354 )
1356 def testPruneDatasets(self):
1357 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1358 butler = Butler(self.tmpConfigFile, writeable=True)
1359 # Load registry data with dimensions to hang datasets off of.
1360 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1361 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1362 # Add some RUN-type collections.
1363 run1 = "run1"
1364 butler.registry.registerRun(run1)
1365 run2 = "run2"
1366 butler.registry.registerRun(run2)
1367 # put some datasets. ref1 and ref2 have the same data ID, and are in
1368 # different runs. ref3 has a different data ID.
1369 metric = makeExampleMetrics()
1370 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1371 datasetType = self.addDatasetType(
1372 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1373 )
1374 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1375 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1376 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1378 # Simple prune.
1379 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1380 with self.assertRaises(LookupError):
1381 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)
1383 # Put data back.
1384 ref1 = butler.put(metric, ref1.unresolved(), run=run1)
1385 ref2 = butler.put(metric, ref2.unresolved(), run=run2)
1386 ref3 = butler.put(metric, ref3.unresolved(), run=run1)
1388 # Check that in normal (non-trust) mode, deleting the datastore record
1389 # first means that trash/emptyTrash will not touch the file.
1390 uri1 = butler.datastore.getURI(ref1)
1391 butler.datastore.bridge.moveToTrash([ref1]) # Update the dataset_location table
1392 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
1393 butler.datastore.trash(ref1)
1394 butler.datastore.emptyTrash()
1395 self.assertTrue(uri1.exists())
1396 uri1.remove() # Clean it up.
1398 # Simulate an execution-butler setup by deleting the datastore
1399 # record but keeping the file around, with trust mode enabled.
1400 butler.datastore.trustGetRequest = True
1401 uri2 = butler.datastore.getURI(ref2)
1402 uri3 = butler.datastore.getURI(ref3)
1403 self.assertTrue(uri2.exists())
1404 self.assertTrue(uri3.exists())
1406 # Remove the datastore record.
1407 butler.datastore.bridge.moveToTrash([ref2]) # Update the dataset_location table
1408 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
1409 self.assertTrue(uri2.exists())
1410 butler.datastore.trash([ref2, ref3])
1411 # Immediate removal of the ref2 file, whose datastore record is already gone.
1412 self.assertFalse(uri2.exists())
1413 # But ref3 has to wait for emptyTrash().
1414 self.assertTrue(uri3.exists())
1415 butler.datastore.emptyTrash()
1416 self.assertFalse(uri3.exists())
1418 # Clear out the datasets from registry.
1419 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
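# To summarize the deletion protocol exercised above: trash() marks
# artifacts for deferred removal and emptyTrash() deletes them, with
# two wrinkles. In trust mode, a trashed ref whose datastore record is
# already gone has its file removed immediately; in normal mode, a
# file whose record was deleted before trash() was called is never
# touched.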
1421 def testPytypePutCoercion(self):
1422 """Test python type coercion on Butler.get and put."""
1424 # Store some data with the normal example storage class.
1425 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1426 datasetTypeName = "test_metric"
1427 butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)
1429 dataId = {"instrument": "DummyCamComp", "visit": 423}
1431 # Put a dict; it should be coerced to a MetricsExample.
1432 test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
1433 metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
1434 test_metric = butler.getDirect(metric_ref)
1435 self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
1436 self.assertEqual(test_metric.summary, test_dict["summary"])
1437 self.assertEqual(test_metric.output, test_dict["output"])
1439 # Check that the put still works if a DatasetType is given with
1440 # a definition matching this python type.
1441 registry_type = butler.registry.getDatasetType(datasetTypeName)
1442 this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
1443 metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
1444 self.assertEqual(metric2_ref.datasetType, registry_type)
1446 # The get will return the type expected by registry.
1447 test_metric2 = butler.getDirect(metric2_ref)
1448 self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")
1450 # Make a new DatasetRef with the compatible but different DatasetType.
1451 # This should now return a dict.
1452 new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
1453 test_dict2 = butler.getDirect(new_ref)
1454 self.assertEqual(get_full_type_name(test_dict2), "dict")
1456 # Get it again with the wrong dataset type definition using get()
1457 # rather than getDirect(). This should be consistent with getDirect()
1458 # behavior and return the type of the DatasetType.
1459 test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
1460 self.assertEqual(get_full_type_name(test_dict3), "dict")
1462 def testPytypeCoercion(self):
1463 """Test python type coercion on Butler.get and put."""
1465 # Store some data with the normal example storage class.
1466 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1467 datasetTypeName = "test_metric"
1468 butler = self.runPutGetTest(storageClass, datasetTypeName)
1470 dataId = {"instrument": "DummyCamComp", "visit": 423}
1471 metric = butler.get(datasetTypeName, dataId=dataId)
1472 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1474 datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
1475 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1478 # Now we need to hack the registry dataset type definition.
1479 # There is no public API for this.
1479 manager = butler.registry._managers.datasets
1480 manager._db.update(
1481 manager._static.dataset_type,
1482 {"name": datasetTypeName},
1483 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1484 )
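# A note on the update() call above: the second argument maps each
# WHERE-clause column name ("name") to the key in the row dict that
# holds its value, which is why the row dict uses the value of
# datasetTypeName itself as a key. Roughly (a sketch; the real table
# and column names come from the registry schema):
#     UPDATE dataset_type
#     SET storage_class = 'StructuredDataNoComponentsModel'
#     WHERE name = '<datasetTypeName>'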
1486 # Force a reset of the dataset type cache.
1487 butler.registry.refresh()
1489 datasetType_new = butler.registry.getDatasetType(datasetTypeName)
1490 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1491 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1493 metric_model = butler.get(datasetTypeName, dataId=dataId)
1494 self.assertNotEqual(type(metric_model), type(metric))
1495 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1497 # Put the model and read it back to show that everything now
1498 # works as normal.
1499 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1500 metric_model_new = butler.get(metric_ref)
1501 self.assertEqual(metric_model_new, metric_model)
1503 # Hack the storage class again to something that will fail on the
1504 # get because no conversion is possible.
1505 manager._db.update(
1506 manager._static.dataset_type,
1507 {"name": datasetTypeName},
1508 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1509 )
1510 butler.registry.refresh()
1512 with self.assertRaises(ValueError):
1513 butler.get(datasetTypeName, dataId=dataId)
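# The coercions above rely on the storage class conversion machinery
# (converters declared between MetricsExample and MetricsExampleModel
# in the test storage class configuration); once the dataset type is
# switched to a storage class with no usable conversion
# (StructuredDataListYaml), get() raises ValueError as asserted.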
1516@unittest.skipUnless(testing is not None, "testing.postgresql module not found")
1517class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1518 """PosixDatastore specialization of a butler using Postgres"""
1520 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1521 fullConfigKey = ".datastore.formatters"
1522 validationCanFail = True
1523 datastoreStr = ["/tmp"]
1524 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1525 registryStr = "PostgreSQL@test"
1527 @staticmethod
1528 def _handler(postgresql):
1529 engine = sqlalchemy.engine.create_engine(postgresql.url())
1530 with engine.begin() as connection:
1531 connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;"))
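# btree_gist is needed because the registry schema uses GiST-based
# exclusion constraints that combine plain scalar columns with
# timespan ranges, which stock PostgreSQL cannot index together
# without this extension.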
1533 @classmethod
1534 def setUpClass(cls):
1535 # Create the postgres test server.
1536 cls.postgresql = testing.postgresql.PostgresqlFactory(
1537 cache_initialized_db=True, on_initialized=cls._handler
1538 )
1539 super().setUpClass()
1541 @classmethod
1542 def tearDownClass(cls):
1543 # Clean up any lingering SQLAlchemy engines/connections
1544 # so they're closed before we shut down the server.
1545 gc.collect()
1546 cls.postgresql.clear_cache()
1547 super().tearDownClass()
1549 def setUp(self):
1550 self.server = self.postgresql()
1552 # Need to add a registry section to the config.
1553 self._temp_config = False
1554 config = Config(self.configFile)
1555 config["registry", "db"] = self.server.url()
1556 with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh:
1557 config.dump(fh)
1558 self.configFile = fh.name
1559 self._temp_config = True
1560 super().setUp()
1562 def tearDown(self):
1563 self.server.stop()
1564 if self._temp_config and os.path.exists(self.configFile):
1565 os.remove(self.configFile)
1566 super().tearDown()
1568 def testMakeRepo(self):
1569 # The base class test assumes that it is using SQLite and that
1570 # the config file is acceptable to SQLite.
1571 raise unittest.SkipTest("Postgres config is not compatible with this test.")
1574class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1575 """InMemoryDatastore specialization of a butler"""
1577 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1578 fullConfigKey = None
1579 useTempRoot = False
1580 validationCanFail = False
1581 datastoreStr = ["datastore='InMemory"]
1582 datastoreName = ["InMemoryDatastore@"]
1583 registryStr = "/gen3.sqlite3"
1585 def testIngest(self):
1586 pass
1589class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1590 """PosixDatastore specialization"""
1592 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
1593 fullConfigKey = ".datastore.datastores.1.formatters"
1594 validationCanFail = True
1595 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
1596 datastoreName = [
1597 "InMemoryDatastore@",
1598 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
1599 "SecondDatastore",
1600 ]
1601 registryStr = "/gen3.sqlite3"
1604class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
1605 """Test that a yaml file in one location can refer to a root in another."""
1607 datastoreStr = ["dir1"]
1608 # Disable the makeRepo test since we are deliberately not using
1609 # butler.yaml as the config name.
1610 fullConfigKey = None
1612 def setUp(self):
1613 self.root = makeTestTempDir(TESTDIR)
1615 # Make a new repository in one place
1616 self.dir1 = os.path.join(self.root, "dir1")
1617 Butler.makeRepo(self.dir1, config=Config(self.configFile))
1619 # Move the yaml file to a different place and add a "root"
1620 self.dir2 = os.path.join(self.root, "dir2")
1621 os.makedirs(self.dir2, exist_ok=True)
1622 configFile1 = os.path.join(self.dir1, "butler.yaml")
1623 config = Config(configFile1)
1624 config["root"] = self.dir1
1625 configFile2 = os.path.join(self.dir2, "butler2.yaml")
1626 config.dumpToUri(configFile2)
1627 os.remove(configFile1)
1628 self.tmpConfigFile = configFile2
1630 def testFileLocations(self):
1631 self.assertNotEqual(self.dir1, self.dir2)
1632 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
1633 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
1634 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
1637class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
1638 """Test that a config file created by makeRepo outside of repo works."""
1640 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1642 def setUp(self):
1643 self.root = makeTestTempDir(TESTDIR)
1644 self.root2 = makeTestTempDir(TESTDIR)
1646 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
1647 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1649 def tearDown(self):
1650 if os.path.exists(self.root2):
1651 shutil.rmtree(self.root2, ignore_errors=True)
1652 super().tearDown()
1654 def testConfigExistence(self):
1655 c = Config(self.tmpConfigFile)
1656 uri_config = ResourcePath(c["root"])
1657 uri_expected = ResourcePath(self.root, forceDirectory=True)
1658 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
1659 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
1661 def testPutGet(self):
1662 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1663 self.runPutGetTest(storageClass, "test_metric")
1666class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
1667 """Test that a config file created by makeRepo outside of repo works."""
1669 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1671 def setUp(self):
1672 self.root = makeTestTempDir(TESTDIR)
1673 self.root2 = makeTestTempDir(TESTDIR)
1675 self.tmpConfigFile = self.root2
1676 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1678 def testConfigExistence(self):
1679 # Append the yaml file else Config constructor does not know the file
1680 # type.
1681 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
1682 super().testConfigExistence()
1685class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
1686 """Test that a config file created by makeRepo outside of repo works."""
1688 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1690 def setUp(self):
1691 self.root = makeTestTempDir(TESTDIR)
1692 self.root2 = makeTestTempDir(TESTDIR)
1694 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
1695 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1698@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
1699class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1700 """S3Datastore specialization of a butler; an S3 storage Datastore +
1701 a local in-memory SqlRegistry.
1702 """
1704 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
1705 fullConfigKey = None
1706 validationCanFail = True
1708 bucketName = "anybucketname"
1709 """Name of the Bucket that will be used in the tests. The name is read from
1710 the config file used with the tests during set-up.
1711 """
1713 root = "butlerRoot/"
1714 """Root repository directory expected to be used in case useTempRoot=False.
1715 Otherwise the root is set to a 20 characters long randomly generated string
1716 during set-up.
1717 """
1719 datastoreStr = [f"datastore={root}"]
1720 """Contains all expected root locations in a format expected to be
1721 returned by Butler stringification.
1722 """
1724 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
1725 """The expected format of the S3 Datastore string."""
1727 registryStr = "/gen3.sqlite3"
1728 """Expected format of the Registry string."""
1730 mock_s3 = mock_s3()
1731 """The mocked s3 interface from moto."""
1733 def genRoot(self):
1734 """Returns a random string of len 20 to serve as a root
1735 name for the temporary bucket repo.
1737 This is equivalent to tempfile.mkdtemp as this is what self.root
1738 becomes when useTempRoot is True.
1739 """
1740 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1741 return rndstr + "/"
1743 def setUp(self):
1744 config = Config(self.configFile)
1745 uri = ResourcePath(config[".datastore.datastore.root"])
1746 self.bucketName = uri.netloc
1748 # Enable S3 mocking of tests.
1749 self.mock_s3.start()
1751 # Set up some fake AWS credentials if none exist; boto3 requires credentials even when talking to moto.
1752 self.usingDummyCredentials = setAwsEnvCredentials()
1754 if self.useTempRoot:
1755 self.root = self.genRoot()
1756 rooturi = f"s3://{self.bucketName}/{self.root}"
1757 config.update({"datastore": {"datastore": {"root": rooturi}}})
1759 # Need a local folder to store the registry database.
1760 self.reg_dir = makeTestTempDir(TESTDIR)
1761 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1763 # Moto needs to know that we expect the bucket self.bucketName to exist
1764 # (this used to be the class attribute bucketName).
1765 s3 = boto3.resource("s3")
1766 s3.create_bucket(Bucket=self.bucketName)
1768 self.datastoreStr = f"datastore={self.root}"
1769 self.datastoreName = [f"FileDatastore@{rooturi}"]
1770 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
1771 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
1773 def tearDown(self):
1774 s3 = boto3.resource("s3")
1775 bucket = s3.Bucket(self.bucketName)
1776 try:
1777 bucket.objects.all().delete()
1778 except botocore.exceptions.ClientError as e:
1779 if e.response["Error"]["Code"] == "404":
1780 # the key was not reachable - pass
1781 pass
1782 else:
1783 raise
1785 bucket = s3.Bucket(self.bucketName)
1786 bucket.delete()
1788 # Stop the S3 mock.
1789 self.mock_s3.stop()
1791 # Unset any dummy credentials that may have been set.
1792 if self.usingDummyCredentials:
1793 unsetAwsEnvCredentials()
1795 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1796 shutil.rmtree(self.reg_dir, ignore_errors=True)
1798 if self.useTempRoot and os.path.exists(self.root):
1799 shutil.rmtree(self.root, ignore_errors=True)
1801 super().tearDown()
1804@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!")
1805class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1806 """WebdavDatastore specialization of a butler; a Webdav storage Datastore +
1807 a local in-memory SqlRegistry.
1808 """
1810 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml")
1811 fullConfigKey = None
1812 validationCanFail = True
1814 serverName = "localhost"
1815 """Name of the server that will be used in the tests.
1816 """
1818 portNumber = 8080
1819 """Port on which the webdav server listens. Automatically chosen
1820 at setUpClass via the _getfreeport() method
1821 """
1823 root = "butlerRoot/"
1824 """Root repository directory expected to be used in case useTempRoot=False.
1825 Otherwise the root is set to a 20 characters long randomly generated string
1826 during set-up.
1827 """
1829 datastoreStr = [f"datastore={root}"]
1830 """Contains all expected root locations in a format expected to be
1831 returned by Butler stringification.
1832 """
1834 datastoreName = ["FileDatastore@https://{serverName}/{root}"]
1835 """The expected format of the WebdavDatastore string."""
1837 registryStr = "/gen3.sqlite3"
1838 """Expected format of the Registry string."""
1840 serverThread = None
1841 """Thread in which the local webdav server will run"""
1843 stopWebdavServer = False
1844 """This flag will cause the webdav server to
1845 gracefully shut down when True
1846 """
1848 def genRoot(self):
1849 """Returns a random string of len 20 to serve as a root
1850 name for the temporary bucket repo.
1852 This is equivalent to tempfile.mkdtemp as this is what self.root
1853 becomes when useTempRoot is True.
1854 """
1855 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1856 return rndstr + "/"
1858 @classmethod
1859 def setUpClass(cls):
1860 # Do the same as the inherited class.
1861 cls.storageClassFactory = StorageClassFactory()
1862 cls.storageClassFactory.addFromConfig(cls.configFile)
1864 cls.portNumber = cls._getfreeport()
1865 # Run a local webdav server on which tests will be run
1866 cls.serverThread = Thread(
1867 target=cls._serveWebdav, args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), daemon=True
1868 )
1869 cls.serverThread.start()
1870 # Wait for it to start
1871 time.sleep(3)
1873 @classmethod
1874 def tearDownClass(cls):
1875 # Ask for graceful shut down of the webdav server
1876 cls.stopWebdavServer = True
1877 # Wait for the thread to exit
1878 cls.serverThread.join()
1879 super().tearDownClass()
1881 def setUp(self):
1882 config = Config(self.configFile)
1884 if self.useTempRoot:
1885 self.root = self.genRoot()
1886 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}"
1887 config.update({"datastore": {"datastore": {"root": self.rooturi}}})
1889 # Need a local folder to store the registry database.
1890 self.reg_dir = makeTestTempDir(TESTDIR)
1891 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1893 self.datastoreStr = f"datastore={self.root}"
1894 self.datastoreName = [f"FileDatastore@{self.rooturi}"]
1896 if not _is_webdav_endpoint(self.rooturi):
1897 raise OSError("Webdav server not running properly: cannot run tests.")
1899 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False)
1900 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml")
1902 def tearDown(self):
1903 # Clear temporary directory
1904 ResourcePath(self.rooturi).remove()
1905 ResourcePath(self.rooturi).session.close()
1907 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1908 shutil.rmtree(self.reg_dir, ignore_errors=True)
1910 if self.useTempRoot and os.path.exists(self.root):
1911 shutil.rmtree(self.root, ignore_errors=True)
1913 super().tearDown()
1915 def _serveWebdav(self, port: int, stopWebdavServer):
1916 """Starts a local webdav-compatible HTTP server,
1917 Listening on http://localhost:port
1918 This server only runs when this test class is instantiated,
1919 and then shuts down. Must be started is a separate thread.
1921 Parameters
1922 ----------
1923 port : `int`
1924 The port number on which the server should listen
1925 """
1926 root_path = gettempdir()
1928 config = {
1929 "host": "0.0.0.0",
1930 "port": port,
1931 "provider_mapping": {"/": root_path},
1932 "http_authenticator": {"domain_controller": None},
1933 "simple_dc": {"user_mapping": {"*": True}},
1934 "verbose": 0,
1935 }
1936 app = WsgiDAVApp(config)
1938 server_args = {
1939 "bind_addr": (config["host"], config["port"]),
1940 "wsgi_app": app,
1941 }
1942 server = wsgi.Server(**server_args)
1943 server.prepare()
1945 try:
1946 # Start the actual server in a separate thread
1947 t = Thread(target=server.serve, daemon=True)
1948 t.start()
1949 # Watch stopWebdavServer and gracefully
1950 # shut down the server when it returns True.
1951 while True:
1952 if stopWebdavServer():
1953 break
1954 time.sleep(1)
1955 except KeyboardInterrupt:
1956 print("Caught Ctrl-C, shutting down...")
1957 finally:
1958 server.stop()
1959 t.join()
@staticmethod
1961 def _getfreeport():
1962 """
1963 Determines a free port using sockets.
1964 """
1965 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
1966 free_socket.bind(("127.0.0.1", 0))
1967 free_socket.listen()
1968 port = free_socket.getsockname()[1]
1969 free_socket.close()
1970 return port
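# Note that the socket is closed before the port is handed to the
# webdav server, so another process could in principle grab the port
# in between; an acceptable race for a test helper, but not a
# general-purpose port reservation mechanism.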
1973class PosixDatastoreTransfers(unittest.TestCase):
1974 """Test data transfers between butlers.
1976 Different dataset-ID managers are tested: UUID to UUID and integer to
1977 integer. UUID to integer is not supported since we do not currently
1978 want to allow that. Integer to UUID is supported, with the caveat
1979 that UUID4 IDs will be generated; these would be incorrect for raw
1980 dataset types, which the test ignores.
1981 """
1983 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1985 @classmethod
1986 def setUpClass(cls):
1987 cls.storageClassFactory = StorageClassFactory()
1988 cls.storageClassFactory.addFromConfig(cls.configFile)
1990 def setUp(self):
1991 self.root = makeTestTempDir(TESTDIR)
1992 self.config = Config(self.configFile)
1994 def tearDown(self):
1995 removeTestTempDir(self.root)
1997 def create_butler(self, manager, label):
1998 config = Config(self.configFile)
1999 config["registry", "managers", "datasets"] = manager
2000 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
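# The manager override above corresponds to this butler.yaml fragment
# (a sketch; the value is whichever manager string was passed in):
#
#     registry:
#       managers:
#         datasets: <fully-qualified manager class name>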
2002 def create_butlers(self, manager1, manager2):
2003 self.source_butler = self.create_butler(manager1, "1")
2004 self.target_butler = self.create_butler(manager2, "2")
2006 def testTransferUuidToUuid(self):
2007 self.create_butlers(
2008 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2009 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2010 )
2011 # Setting id_gen_map should have no effect here
2012 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
2014 def testTransferIntToInt(self):
2015 with self.assertWarns(FutureWarning):
2016 self.create_butlers(
2017 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
2018 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
2019 )
2020 # Integer dataset IDs only allow the UNIQUE generation mode.
2021 self.assertButlerTransfers()
2023 def testTransferIntToUuid(self):
2024 with self.assertWarns(FutureWarning):
2025 self.create_butlers(
2026 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
2027 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2028 )
2029 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
2031 def testTransferMissing(self):
2032 """Test transfers where datastore records are missing.
2034 This is how execution butler works.
2035 """
2036 self.create_butlers(
2037 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2038 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2039 )
2041 # Configure the source butler to allow trust.
2042 self.source_butler.datastore.trustGetRequest = True
2044 self.assertButlerTransfers(purge=True)
2046 def testTransferMissingDisassembly(self):
2047 """Test transfers where datastore records are missing.
2049 This is how execution butler works.
2050 """
2051 self.create_butlers(
2052 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2053 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2054 )
2056 # Configure the source butler to allow trust.
2057 self.source_butler.datastore.trustGetRequest = True
2059 # Test disassembly.
2060 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
2062 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"):
2063 """Test that a run can be transferred to another butler."""
2065 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
2066 datasetTypeName = "random_data"
2068 # The test will create 3 collections, and we will want to transfer
2069 # two of those three.
2070 runs = ["run1", "run2", "other"]
2072 # Also want to use two different dataset types to ensure that
2073 # grouping works.
2074 datasetTypeNames = ["random_data", "random_data_2"]
2076 # Create the run collections in the source butler.
2077 for run in runs:
2078 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
2080 # Create dimensions in source butler.
2081 n_exposures = 30
2082 self.source_butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
2083 self.source_butler.registry.insertDimensionData(
2084 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
2085 )
2086 self.source_butler.registry.insertDimensionData(
2087 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
2088 )
2090 for i in range(n_exposures):
2091 self.source_butler.registry.insertDimensionData(
2092 "exposure",
2093 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
2094 )
2096 # Create dataset types in the source butler.
2097 dimensions = self.source_butler.registry.dimensions.extract(["instrument", "exposure"])
2098 for datasetTypeName in datasetTypeNames:
2099 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
2100 self.source_butler.registry.registerDatasetType(datasetType)
2102 # Write a dataset to an unrelated run -- this will ensure that
2103 # we are rewriting integer dataset ids in the target if necessary.
2104 # Will not be relevant for UUID.
2105 run = "distraction"
2106 butler = Butler(butler=self.source_butler, run=run)
2107 butler.put(
2108 makeExampleMetrics(),
2109 datasetTypeName,
2110 exposure=1,
2111 instrument="DummyCamComp",
2112 physical_filter="d-r",
2113 )
2115 # Write some example metrics to the source
2116 butler = Butler(butler=self.source_butler)
2118 # Set of DatasetRefs that should be in the list of refs to transfer
2119 # but which will not be transferred.
2120 deleted = set()
2122 n_expected = 20 # Number of datasets expected to be transferred
2123 source_refs = []
2124 for i in range(n_exposures):
2125 # Put a third of the datasets into each collection; only retain
2126 # two thirds.
2127 index = i % 3
2128 run = runs[index]
2129 datasetTypeName = datasetTypeNames[i % 2]
2131 metric_data = {
2132 "summary": {"counter": i},
2133 "output": {"text": "metric"},
2134 "data": [2 * x for x in range(i)],
2135 }
2136 metric = MetricsExample(**metric_data)
2137 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
2138 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
2140 # Remove the datastore record using low-level API
2141 if purge:
2142 # Remove records for a fraction.
2143 if index == 1:
2145 # For one of these delete the file as well.
2146 # This allows the "missing" code to filter the
2147 # file out.
2148 if not deleted:
2149 primary, uris = butler.datastore.getURIs(ref)
2150 if primary:
2151 primary.remove()
2152 for uri in uris.values():
2153 uri.remove()
2154 n_expected -= 1
2155 deleted.add(ref)
2157 # Remove the datastore record.
2158 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id})
2160 if index < 2:
2161 source_refs.append(ref)
2162 if ref not in deleted:
2163 new_metric = butler.get(ref.unresolved(), collections=run)
2164 self.assertEqual(new_metric, metric)
2166 # Create some bad dataset types to ensure we check for inconsistent
2167 # definitions.
2168 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
2169 for datasetTypeName in datasetTypeNames:
2170 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
2171 self.target_butler.registry.registerDatasetType(datasetType)
2172 with self.assertRaises(ConflictingDefinitionError) as cm:
2173 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2174 self.assertIn("dataset type differs", str(cm.exception))
2176 # And remove the bad definitions.
2177 for datasetTypeName in datasetTypeNames:
2178 self.target_butler.registry.removeDatasetType(datasetTypeName)
2180 # Transfer without creating dataset types should fail.
2181 with self.assertRaises(KeyError):
2182 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2184 # Transfer without creating dimensions should fail.
2185 with self.assertRaises(ConflictingDefinitionError) as cm:
2186 self.target_butler.transfer_from(
2187 self.source_butler, source_refs, id_gen_map=id_gen_map, register_dataset_types=True
2188 )
2189 self.assertIn("dimension", str(cm.exception))
2191 # The failed transfer above leaves the registry in an inconsistent
2192 # state because the run is created but then rolled back without
2193 # the collection cache being cleared. For now, force a refresh.
2194 # This can be removed with DM-35498.
2195 self.target_butler.registry.refresh()
2197 # Now transfer them to the second butler, including dimensions.
2198 with self.assertLogs(level=logging.DEBUG) as cm:
2199 transferred = self.target_butler.transfer_from(
2200 self.source_butler,
2201 source_refs,
2202 id_gen_map=id_gen_map,
2203 register_dataset_types=True,
2204 transfer_dimensions=True,
2205 )
2206 self.assertEqual(len(transferred), n_expected)
2207 log_output = ";".join(cm.output)
2208 self.assertIn("found in datastore for chunk", log_output)
2209 self.assertIn("Creating output run", log_output)
2211 # Do the transfer twice to ensure that it will do nothing extra.
2212 # Only do this if purge=True because it does not work for int
2213 # dataset_id.
2214 if purge:
2215 # This should not need to register dataset types.
2216 transferred = self.target_butler.transfer_from(
2217 self.source_butler, source_refs, id_gen_map=id_gen_map
2218 )
2219 self.assertEqual(len(transferred), n_expected)
2221 # Also do an explicit low-level transfer to trigger some
2222 # edge cases.
2223 with self.assertLogs(level=logging.DEBUG) as cm:
2224 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
2225 log_output = ";".join(cm.output)
2226 self.assertIn("no file artifacts exist", log_output)
2228 with self.assertRaises(TypeError):
2229 self.target_butler.datastore.transfer_from(self.source_butler, source_refs)
2231 with self.assertRaises(ValueError):
2232 self.target_butler.datastore.transfer_from(
2233 self.source_butler.datastore, source_refs, transfer="split"
2234 )
2236 # Now try to get the same refs from the new butler.
2237 for ref in source_refs:
2238 if ref not in deleted:
2239 unresolved_ref = ref.unresolved()
2240 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run)
2241 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run)
2242 self.assertEqual(new_metric, old_metric)
2244 # Now prune the run2 collection and create a CHAINED collection instead.
2245 # This should block the transfer.
2246 self.target_butler.pruneCollection("run2", purge=True, unstore=True)
2247 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2248 with self.assertRaises(CollectionTypeError):
2249 # Re-importing the run1 datasets can be problematic if they
2250 # use integer IDs so filter those out.
2251 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2252 self.target_butler.transfer_from(self.source_butler, to_transfer, id_gen_map=id_gen_map)
2255 if __name__ == "__main__":
2256 unittest.main()