# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler.
"""

import gc
import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import socket
import string
import tempfile
import time
import unittest
import unittest.mock
from tempfile import gettempdir
from threading import Thread

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto mock_s3 can not be imported."""
        return cls
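

# Illustrative sketch, not invoked by the test suite: with the no-op fallback
# above, @mock_s3 can be applied unconditionally and tests then skip
# themselves when boto3 is unavailable. The class and method names here are
# hypothetical.
def _example_mock_s3_fallback():
    @mock_s3
    class _ExampleS3TestCase(unittest.TestCase):
        @unittest.skipIf(boto3 is None, "moto/boto3 is not installed")
        def test_noop(self):
            self.assertTrue(True)

    return _ExampleS3TestCase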


try:
    # It's possible but silly to have testing.postgresql installed without
    # having the postgresql server installed (because then nothing in
    # testing.postgresql would work), so we use the presence of that module
    # to test whether we can expect the server to be available.
    import testing.postgresql
except ImportError:
    testing = None

try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None

import astropy.time
import sqlalchemy
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    CollectionSearch,
    CollectionType,
    Config,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplate,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
)
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.http import _is_webdav_endpoint
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImport
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
    """Return a simple example MetricsExample for use in put/get tests."""
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )
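

# Illustrative sketch, not invoked by the test suite: the typical round trip
# exercised by the tests below, assuming a butler created by one of the test
# fixtures. The dataset type name and data ID mirror values used in this file.
def _example_metric_round_trip(butler):
    metric = makeExampleMetrics()
    ref = butler.put(metric, "test_metric", {"instrument": "DummyCamComp", "visit": 423})
    assert butler.getDirect(ref) == metric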


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent misdiagnosis
    that might otherwise occur when a standard exception is used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not covered by any other test
    cases."""

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")
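

# Illustrative sketch, not invoked by the test suite: constructing a
# ButlerConfig with an extra search path so that override files are picked
# up, as verified in ButlerConfigTests.testSearchPath above. The directory
# names match the test data used there.
def _example_search_path_config():
    configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
    overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
    return ButlerConfig(configFile, searchPaths=[overrideDirectory])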


class ButlerPutGetTests:
    """Helper methods for running a suite of put/get tests from different
    butler configurations."""

    root = None
    default_run = "ingésτ😺"

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
        removeTestTempDir(self.root)

    def create_butler(self, run, storageClass, datasetTypeName):
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "visit_system": 1,
                },
            )
        return butler, datasetType

    def runPutGetTest(self, storageClass, datasetTypeName):
        # New datasets will be added to a run collection, and that same run
        # will be searched when looking the datasets up again.
        run = self.default_run
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # And getDirectDeferred with a ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

            # Can the artifacts themselves be retrieved?
            if not butler.datastore.isEphemeral:
                root_uri = ResourcePath(self.root)

                for preserve_path in (True, False):
                    destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                    # Use copy so that we can test that overwrite
                    # protection works (using "auto" for File URIs would
                    # use hard links and subsequent transfer would work
                    # because it knows they are the same file).
                    transferred = butler.retrieveArtifacts(
                        [ref], destination, preserve_path=preserve_path, transfer="copy"
                    )
                    self.assertGreater(len(transferred), 0)
                    artifacts = list(ResourcePath.findFileResources([destination]))
                    self.assertEqual(set(transferred), set(artifacts))

                    for artifact in transferred:
                        path_in_destination = artifact.relative_to(destination)
                        self.assertIsNotNone(path_in_destination)

                        # When the path is not preserved there should not be
                        # any path separators.
                        num_seps = path_in_destination.count("/")
                        if preserve_path:
                            self.assertGreater(num_seps, 0)
                        else:
                            self.assertEqual(num_seps, 0)

                    primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                    n_uris = len(secondary_uris)
                    if primary_uri:
                        n_uris += 1
                    self.assertEqual(
                        len(artifacts),
                        n_uris,
                        "Comparing expected artifacts vs actual:"
                        f" {artifacts} vs {primary_uri} and {secondary_uris}",
                    )

                    if preserve_path:
                        # No need to run these twice
                        with self.assertRaises(ValueError):
                            butler.retrieveArtifacts([ref], destination, transfer="move")

                        with self.assertRaises(FileExistsError):
                            butler.retrieveArtifacts([ref], destination)

                        transferred_again = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, overwrite=True
                        )
                        self.assertEqual(set(transferred_again), set(transferred))

            # Now remove the dataset completely.
            butler.pruneDatasets([ref], purge=True, unstore=True)
            # Lookup with original args should still fail.
            with self.assertRaises(LookupError):
                butler.datasetExists(*args, collections=this_run)
            # getDirect() should still fail.
            with self.assertRaises(FileNotFoundError):
                butler.getDirect(ref)
            # Registry shouldn't be able to find it by dataset_id anymore.
            self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know the collections
            # are now empty.
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present
        return butler
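
    # Illustrative sketch, not invoked by the test suite: the minimal
    # retrieveArtifacts call pattern exercised at length in runPutGetTest
    # above. "copy" is used there so that overwrite protection can be tested;
    # "auto" could hard-link on local file systems.
    def _example_retrieve_artifacts(self, butler, ref, destination):
        return butler.retrieveArtifacts(
            [ref], destination, preserve_path=True, transfer="copy"
        )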

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # A second registration will be allowed but indicates a no-op.
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection raises a
        # CollectionError.
        with self.assertRaises(CollectionError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Removing the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
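

# Illustrative sketch, not invoked by the test suite: associating an existing
# dataset with a TAGGED collection and then pruning only the tag, which
# leaves the dataset findable in its original RUN, as
# testDeferredCollectionPassing shows above. The collection name is
# hypothetical.
def _example_tag_and_prune(butler, ref):
    butler.registry.registerCollection("tagged", type=CollectionType.TAGGED)
    butler.registry.associate("tagged", [ref])
    butler.pruneDatasets([ref], tags=["tagged"])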


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run=self.default_run)
            self.assertIsInstance(butler, Butler)

            # Even with a ResourcePath.
            butler = Butler(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {self.default_run})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, CollectionSearch.fromExpression(["other"]))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), {"label", "bad_label"})
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler("not_there", writeable=False)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases"):
            # No aliases registered.
            Butler("not_there")
        self.assertEqual(Butler.get_known_repos(), set())
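
    # Illustrative sketch, not invoked by the test suite: resolving a
    # repository label via the DAF_BUTLER_REPOSITORY_INDEX environment
    # variable, as exercised in testConstructor above. The index path and
    # the "label" alias are assumed to exist.
    @staticmethod
    def _example_repo_index_lookup(index_path):
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": index_path}):
            return Butler.get_repo_uri("label")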

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")
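
    # Illustrative sketch, not invoked by the test suite: predicting the URI
    # of a dataset that has not been written yet. Predicted URIs carry a
    # "predicted" fragment, as the assertions above check.
    def _example_predicted_uri(self, butler, datasetType, dataId):
        uri, components = butler.getURIs(datasetType, dataId=dataId, predict=True)
        return uri, components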

    def testIngest(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile, refs=refs, formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy", record_validation_info=False)

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)
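
    # Illustrative sketch, not invoked by the test suite: the core ingest
    # pattern from testIngest above. An existing file and its ref(s) are
    # wrapped in a FileDataset and copied into the datastore; "path" and
    # "refIn" are assumed to come from the caller.
    def _example_ingest_file(self, butler, path, refIn):
        butler.ingest(FileDataset(path=path, refs=[refIn]), transfer="copy")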

    def testPruneCollections(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge and not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [])

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)
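
    # Illustrative sketch, not invoked by the test suite: building a CHAINED
    # collection whose search order makes run1 shadow run2 for duplicate data
    # IDs, as discussed in testPruneCollections above. The chain name is
    # hypothetical.
    def _example_chained_collection(self, butler, run1, run2):
        butler.registry.registerCollection("chain1", type=CollectionType.CHAINED)
        butler.registry.setCollectionChain("chain1", [run1, run2])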

    def testPickle(self):
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            (
                "instrument",
                {"instrument": "DummyCam"},
                {"instrument": "DummyHSC"},
                {"instrument": "DummyCamComp"},
            ),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are not
        # created for its components, but querying can still return them.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)
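
    # Illustrative sketch, not invoked by the test suite: the rollback
    # behavior that testTransaction relies on. Any exception raised inside
    # butler.transaction() undoes the registry and datastore changes made
    # within the block.
    def _example_transaction_rollback(self, butler, metric, datasetTypeName, dataId):
        try:
            with butler.transaction():
                butler.put(metric, datasetTypeName, dataId)
                raise TransactionTestError("force a rollback")
        except TransactionTestError:
            pass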

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # Create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" is missing a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # And check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)
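

# Illustrative sketch, not invoked by the test suite: putting with an
# alternate data ID key set (seq_num plus day_obs rather than exposure),
# which the Butler rewrites using dimension records, as
# testButlerRewriteDataId verifies above. The values mirror that test.
def _example_rewritten_data_id(butler, metric):
    dataId = {"seq_num": 0, "day_obs": 20210530, "instrument": "DummyCamComp", "physical_filter": "d-r"}
    return butler.put(metric, "random_data", dataId=dataId)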


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check if a file exists at a given path (relative to root).

        Test testPutTemplates verifies actual physical existence of the files
        in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(
                butler.datastore.root, f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle"
            ),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(
                butler.datastore.root, f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle"
            ),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Use a template that has a typo in dimension record metadata.
        # Easier to test with a butler that has a ref with records attached.
        template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits")
        with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"):
            path = template.format(ref)
        self.assertEqual(path, f"a/v423/{ref.id}_fits")

        template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits")
        with self.assertRaises(KeyError):
            with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"):
                template.format(ref)

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)
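
    # Illustrative sketch, not invoked by the test suite: formatting a
    # FileTemplate directly against a resolved ref with dimension records
    # attached, as in the template checks above.
    def _example_format_template(self, ref):
        template = FileTemplate("a/{visit.name}/{id}.fits")
        return template.format(ref)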

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """Run an export to a temp directory and an import back into a new
        temp directory repo. Does not assume a POSIX datastore.
        """
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        print("Root:", exportButler.datastore.root)
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements
                # even though there aren't any in these datasets or in the
                # database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public api.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                        reuse_ids=False,
                    )
                importButler = Butler(importDir, run=self.default_run)
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )
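
    # Illustrative sketch, not invoked by the test suite: the minimal export
    # pattern used in runImportExportTest above. The context manager writes
    # an export file and transfers the dataset artifacts alongside it.
    def _example_export(self, butler, datasets, exportDir):
        exportFile = os.path.join(exportDir, "exports.yaml")
        with butler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
            export.saveDatasets(datasets)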

    def testRemoveRuns(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put a dataset in each.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])
        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler"""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self):
        """Independent test of constructor using PathLike."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self):
        """Test local export with a selection of file transfer modes."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        datastoreRoot = exportButler.datastore.root

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume local file system
            self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
                    export.saveDatasets(datasets)
                for path in pathsInStore:
                    self.assertTrue(
                        self.checkFileExists(exportDir, path),
                        f"Check that mode {transfer} exported files",
                    )
1355 def testPruneDatasets(self):
1356 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1357 butler = Butler(self.tmpConfigFile, writeable=True)
1358 # Load registry data with dimensions to hang datasets off of.
1359 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1360 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1361 # Add some RUN-type collections.
1362 run1 = "run1"
1363 butler.registry.registerRun(run1)
1364 run2 = "run2"
1365 butler.registry.registerRun(run2)
1366 # put some datasets. ref1 and ref2 have the same data ID, and are in
1367 # different runs. ref3 has a different data ID.
1368 metric = makeExampleMetrics()
1369 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1370 datasetType = self.addDatasetType(
1371 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1372 )
1373 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1374 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1375 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1377 # Simple prune.
1378 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1379 with self.assertRaises(LookupError):
1380 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)
1382 # Put data back.
1383 ref1 = butler.put(metric, ref1.unresolved(), run=run1)
1384 ref2 = butler.put(metric, ref2.unresolved(), run=run2)
1385 ref3 = butler.put(metric, ref3.unresolved(), run=run1)
1387 # Check that in normal mode, deleting the record first means that
1388 # trash will not touch the file.
1389 uri1 = butler.datastore.getURI(ref1)
1390 butler.datastore.bridge.moveToTrash([ref1]) # Update the dataset_location table
1391 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
1392 butler.datastore.trash(ref1)
1393 butler.datastore.emptyTrash()
1394 self.assertTrue(uri1.exists())
1395 uri1.remove() # Clean it up.
1397 # Simulate execution butler setup by deleting the datastore
1398 # record but keeping the file around and trusting.
1399 butler.datastore.trustGetRequest = True
1400 uri2 = butler.datastore.getURI(ref2)
1401 uri3 = butler.datastore.getURI(ref3)
1402 self.assertTrue(uri2.exists())
1403 self.assertTrue(uri3.exists())
1405 # Remove the datastore record.
1406 butler.datastore.bridge.moveToTrash([ref2]) # Update the dataset_location table
1407 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
1408 self.assertTrue(uri2.exists())
1409 butler.datastore.trash([ref2, ref3])
1410 # Immediate removal of the ref2 file.
1411 self.assertFalse(uri2.exists())
1412 # But ref3 has to wait for emptyTrash.
1413 self.assertTrue(uri3.exists())
1414 butler.datastore.emptyTrash()
1415 self.assertFalse(uri3.exists())
1417 # Clear out the datasets from registry.
1418 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
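# A minimal sketch of the two-phase delete that testPruneDatasets exercises:
# in normal mode trash() only marks the artifact and emptyTrash() removes
# the file, while an artifact whose datastore record has already been
# deleted is removed by trash() immediately. `butler` and `ref` are assumed
# to be a writeable Butler and a resolved DatasetRef, as in the test above.
def _sketch_two_phase_delete(butler, ref):
    uri = butler.datastore.getURI(ref)
    butler.datastore.trash([ref])   # phase one: mark as trashed
    assert uri.exists()             # artifact still present (record existed)
    butler.datastore.emptyTrash()   # phase two: remove the file artifact
    assert not uri.exists()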
1420 def testPytypePutCoercion(self):
1421 """Test python type coercion on Butler.get and put."""
1423 # Store some data with the normal example storage class.
1424 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1425 datasetTypeName = "test_metric"
1426 butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)
1428 dataId = {"instrument": "DummyCamComp", "visit": 423}
1430 # Put a dict and this should coerce to a MetricsExample
1431 test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
1432 metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
1433 test_metric = butler.getDirect(metric_ref)
1434 self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
1435 self.assertEqual(test_metric.summary, test_dict["summary"])
1436 self.assertEqual(test_metric.output, test_dict["output"])
1438 # Check that the put still works if a DatasetType is given with
1439 # a definition matching this python type.
1440 registry_type = butler.registry.getDatasetType(datasetTypeName)
1441 this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
1442 metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
1443 self.assertEqual(metric2_ref.datasetType, registry_type)
1445 # The get will return the type expected by registry.
1446 test_metric2 = butler.getDirect(metric2_ref)
1447 self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")
1449 # Make a new DatasetRef with the compatible but different DatasetType.
1450 # This should now return a dict.
1451 new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
1452 test_dict2 = butler.getDirect(new_ref)
1453 self.assertEqual(get_full_type_name(test_dict2), "dict")
1455 # Get it again with the wrong dataset type definition using get()
1456 # rather than getDirect(). This should be consistent with getDirect()
1457 # behavior and return the type of the DatasetType.
1458 test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
1459 self.assertEqual(get_full_type_name(test_dict3), "dict")
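# A minimal sketch of the storage-class coercion that testPytypePutCoercion
# demonstrates: a plain dict handed to put() is coerced to the python type
# of the registered storage class, so the read returns a MetricsExample.
# `butler` and `data_id` are assumed to match the test above.
def _sketch_put_coercion(butler, data_id):
    ref = butler.put({"summary": {"a": 1}, "output": {"b": 2}}, "test_metric", dataId=data_id)
    metric = butler.getDirect(ref)
    assert get_full_type_name(metric) == "lsst.daf.butler.tests.MetricsExample"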
1461 def testPytypeCoercion(self):
1462 """Test python type coercion on Butler.get and put."""
1464 # Store some data with the normal example storage class.
1465 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1466 datasetTypeName = "test_metric"
1467 butler = self.runPutGetTest(storageClass, datasetTypeName)
1469 dataId = {"instrument": "DummyCamComp", "visit": 423}
1470 metric = butler.get(datasetTypeName, dataId=dataId)
1471 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1473 datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
1474 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1476 # Now need to hack the registry dataset type definition.
1477 # There is no API for this.
1478 manager = butler.registry._managers.datasets
1479 manager._db.update(
1480 manager._static.dataset_type,
1481 {"name": datasetTypeName},
1482 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1483 )
1485 # Force reset of dataset type cache
1486 butler.registry.refresh()
1488 datasetType_new = butler.registry.getDatasetType(datasetTypeName)
1489 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1490 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1492 metric_model = butler.get(datasetTypeName, dataId=dataId)
1493 self.assertNotEqual(type(metric_model), type(metric))
1494 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1496 # Put the model and read it back to show that everything now
1497 # works as normal.
1498 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1499 metric_model_new = butler.get(metric_ref)
1500 self.assertEqual(metric_model_new, metric_model)
1502 # Hack the storage class again to something that will make the
1503 # get fail because no conversion class is available.
1504 manager._db.update(
1505 manager._static.dataset_type,
1506 {"name": datasetTypeName},
1507 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1508 )
1509 butler.registry.refresh()
1511 with self.assertRaises(ValueError):
1512 butler.get(datasetTypeName, dataId=dataId)
1515@unittest.skipUnless(testing is not None, "testing.postgresql module not found")
1516class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1517 """PosixDatastore specialization of a butler using Postgres"""
1519 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1520 fullConfigKey = ".datastore.formatters"
1521 validationCanFail = True
1522 datastoreStr = ["/tmp"]
1523 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1524 registryStr = "PostgreSQL@test"
1526 @staticmethod
1527 def _handler(postgresql):
1528 engine = sqlalchemy.engine.create_engine(postgresql.url())
1529 with engine.begin() as connection:
1530 connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;"))
1532 @classmethod
1533 def setUpClass(cls):
1534 # Create the postgres test server.
1535 cls.postgresql = testing.postgresql.PostgresqlFactory(
1536 cache_initialized_db=True, on_initialized=cls._handler
1537 )
1538 super().setUpClass()
1540 @classmethod
1541 def tearDownClass(cls):
1542 # Clean up any lingering SQLAlchemy engines/connections
1543 # so they're closed before we shut down the server.
1544 gc.collect()
1545 cls.postgresql.clear_cache()
1546 super().tearDownClass()
1548 def setUp(self):
1549 self.server = self.postgresql()
1551 # Need to add a registry section to the config.
1552 self._temp_config = False
1553 config = Config(self.configFile)
1554 config["registry", "db"] = self.server.url()
1555 with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh:
1556 config.dump(fh)
1557 self.configFile = fh.name
1558 self._temp_config = True
1559 super().setUp()
1561 def tearDown(self):
1562 self.server.stop()
1563 if self._temp_config and os.path.exists(self.configFile):
1564 os.remove(self.configFile)
1565 super().tearDown()
1567 def testMakeRepo(self):
1568 # The base class test assumes that it's using sqlite and that
1569 # the config file is acceptable to sqlite.
1570 raise unittest.SkipTest("Postgres config is not compatible with this test.")
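# A minimal sketch (requires testing.postgresql plus a local postgres
# install) of the factory pattern this test case uses: caching the
# initialized template database makes per-test servers cheap, and the
# on_initialized hook runs once against that template.
def _sketch_postgres_server():
    factory = testing.postgresql.PostgresqlFactory(cache_initialized_db=True)
    server = factory()   # fresh server cloned from the cached template
    url = server.url()   # suitable for config["registry", "db"]
    server.stop()
    factory.clear_cache()
    return url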
1573class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1574 """InMemoryDatastore specialization of a butler"""
1576 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1577 fullConfigKey = None
1578 useTempRoot = False
1579 validationCanFail = False
1580 datastoreStr = ["datastore='InMemory"]
1581 datastoreName = ["InMemoryDatastore@"]
1582 registryStr = "/gen3.sqlite3"
1584 def testIngest(self):
1585 pass
1588class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1589 """PosixDatastore specialization"""
1591 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
1592 fullConfigKey = ".datastore.datastores.1.formatters"
1593 validationCanFail = True
1594 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
1595 datastoreName = [
1596 "InMemoryDatastore@",
1597 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
1598 "SecondDatastore",
1599 ]
1600 registryStr = "/gen3.sqlite3"
1603class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
1604 """Test that a yaml file in one location can refer to a root in another."""
1606 datastoreStr = ["dir1"]
1607 # Disable the makeRepo test since we are deliberately not using
1608 # butler.yaml as the config name.
1609 fullConfigKey = None
1611 def setUp(self):
1612 self.root = makeTestTempDir(TESTDIR)
1614 # Make a new repository in one place
1615 self.dir1 = os.path.join(self.root, "dir1")
1616 Butler.makeRepo(self.dir1, config=Config(self.configFile))
1618 # Move the yaml file to a different place and add a "root"
1619 self.dir2 = os.path.join(self.root, "dir2")
1620 os.makedirs(self.dir2, exist_ok=True)
1621 configFile1 = os.path.join(self.dir1, "butler.yaml")
1622 config = Config(configFile1)
1623 config["root"] = self.dir1
1624 configFile2 = os.path.join(self.dir2, "butler2.yaml")
1625 config.dumpToUri(configFile2)
1626 os.remove(configFile1)
1627 self.tmpConfigFile = configFile2
1629 def testFileLocations(self):
1630 self.assertNotEqual(self.dir1, self.dir2)
1631 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
1632 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
1633 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
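# A minimal sketch of the root-redirection pattern this test case relies on:
# a butler config can live anywhere as long as its "root" key points at the
# directory that actually holds the repository. Paths are illustrative.
def _sketch_relocate_config(repo_dir, new_config_uri):
    config = Config(os.path.join(repo_dir, "butler.yaml"))
    config["root"] = repo_dir         # record where the data actually lives
    config.dumpToUri(new_config_uri)  # the config itself can now live elsewhere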
1636class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
1637 """Test that a config file created by makeRepo outside of repo works."""
1639 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1641 def setUp(self):
1642 self.root = makeTestTempDir(TESTDIR)
1643 self.root2 = makeTestTempDir(TESTDIR)
1645 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
1646 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1648 def tearDown(self):
1649 if os.path.exists(self.root2):
1650 shutil.rmtree(self.root2, ignore_errors=True)
1651 super().tearDown()
1653 def testConfigExistence(self):
1654 c = Config(self.tmpConfigFile)
1655 uri_config = ResourcePath(c["root"])
1656 uri_expected = ResourcePath(self.root, forceDirectory=True)
1657 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
1658 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
1660 def testPutGet(self):
1661 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1662 self.runPutGetTest(storageClass, "test_metric")
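# A minimal sketch of the makeRepo outfile behaviour covered by these test
# cases: the generated config can be written outside the repository root,
# and a Butler constructed from the out-of-tree config still resolves to
# the repo. Paths are illustrative.
def _sketch_makerepo_outfile(repo_root, config_path):
    Butler.makeRepo(repo_root, outfile=config_path)
    return Butler(config_path, writeable=False)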
1665class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
1666 """Test that a config file created by makeRepo outside of repo works."""
1668 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1670 def setUp(self):
1671 self.root = makeTestTempDir(TESTDIR)
1672 self.root2 = makeTestTempDir(TESTDIR)
1674 self.tmpConfigFile = self.root2
1675 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1677 def testConfigExistence(self):
1678 # Append the yaml file name, else the Config constructor does not
1679 # know the file type.
1680 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
1681 super().testConfigExistence()
1684class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
1685 """Test that a config file created by makeRepo outside of repo works."""
1687 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1689 def setUp(self):
1690 self.root = makeTestTempDir(TESTDIR)
1691 self.root2 = makeTestTempDir(TESTDIR)
1693 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
1694 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1697@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
1698class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1699 """S3Datastore specialization of a butler; an S3 storage Datastore +
1700 a local in-memory SqlRegistry.
1701 """
1703 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
1704 fullConfigKey = None
1705 validationCanFail = True
1707 bucketName = "anybucketname"
1708 """Name of the Bucket that will be used in the tests. The name is read from
1709 the config file used with the tests during set-up.
1710 """
1712 root = "butlerRoot/"
1713 """Root repository directory expected to be used in case useTempRoot=False.
1714 Otherwise the root is set to a 20 characters long randomly generated string
1715 during set-up.
1716 """
1718 datastoreStr = [f"datastore={root}"]
1719 """Contains all expected root locations in a format expected to be
1720 returned by Butler stringification.
1721 """
1723 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
1724 """The expected format of the S3 Datastore string."""
1726 registryStr = "/gen3.sqlite3"
1727 """Expected format of the Registry string."""
1729 mock_s3 = mock_s3()
1730 """The mocked s3 interface from moto."""
1732 def genRoot(self):
1733 """Returns a random string of len 20 to serve as a root
1734 name for the temporary bucket repo.
1736 This is equivalent to tempfile.mkdtemp as this is what self.root
1737 becomes when useTempRoot is True.
1738 """
1739 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1740 return rndstr + "/"
1742 def setUp(self):
1743 config = Config(self.configFile)
1744 uri = ResourcePath(config[".datastore.datastore.root"])
1745 self.bucketName = uri.netloc
1747 # Enable S3 mocking of tests.
1748 self.mock_s3.start()
1750 # set up some fake credentials if they do not exist
1751 self.usingDummyCredentials = setAwsEnvCredentials()
1753 if self.useTempRoot:
1754 self.root = self.genRoot()
1755 rooturi = f"s3://{self.bucketName}/{self.root}"
1756 config.update({"datastore": {"datastore": {"root": rooturi}}})
1758 # need local folder to store registry database
1759 self.reg_dir = makeTestTempDir(TESTDIR)
1760 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1762 # Moto needs to know that we expect the bucket bucketName to exist
1763 # (this used to be the class attribute bucketName).
1764 s3 = boto3.resource("s3")
1765 s3.create_bucket(Bucket=self.bucketName)
1767 self.datastoreStr = f"datastore={self.root}"
1768 self.datastoreName = [f"FileDatastore@{rooturi}"]
1769 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
1770 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
1772 def tearDown(self):
1773 s3 = boto3.resource("s3")
1774 bucket = s3.Bucket(self.bucketName)
1775 try:
1776 bucket.objects.all().delete()
1777 except botocore.exceptions.ClientError as e:
1778 if e.response["Error"]["Code"] == "404":
1779 # the key was not reachable - pass
1780 pass
1781 else:
1782 raise
1784 bucket = s3.Bucket(self.bucketName)
1785 bucket.delete()
1787 # Stop the S3 mock.
1788 self.mock_s3.stop()
1790 # unset any potentially set dummy credentials
1791 if self.usingDummyCredentials:
1792 unsetAwsEnvCredentials()
1794 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1795 shutil.rmtree(self.reg_dir, ignore_errors=True)
1797 if self.useTempRoot and os.path.exists(self.root):
1798 shutil.rmtree(self.root, ignore_errors=True)
1800 super().tearDown()
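# A minimal sketch of the moto pattern used by this test case: while the
# mock is active every boto3 call is served by an in-memory fake, so the
# bucket must be created explicitly before use; dummy credentials suffice.
# The bucket name is illustrative.
def _sketch_mocked_bucket(bucket_name="example-bucket"):
    mock = mock_s3()
    mock.start()
    usingDummy = setAwsEnvCredentials()  # fake credentials if none are set
    try:
        s3 = boto3.resource("s3")
        s3.create_bucket(Bucket=bucket_name)
        # ... exercise code against s3://<bucket_name>/ here ...
    finally:
        if usingDummy:
            unsetAwsEnvCredentials()
        mock.stop()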
1803@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!")
1804class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1805 """WebdavDatastore specialization of a butler; a Webdav storage Datastore +
1806 a local in-memory SqlRegistry.
1807 """
1809 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml")
1810 fullConfigKey = None
1811 validationCanFail = True
1813 serverName = "localhost"
1814 """Name of the server that will be used in the tests.
1815 """
1817 portNumber = 8080
1818 """Port on which the webdav server listens. Automatically chosen
1819 in setUpClass via the _getfreeport() method.
1820 """
1822 root = "butlerRoot/"
1823 """Root repository directory expected to be used in case useTempRoot=False.
1824 Otherwise the root is set to a 20 characters long randomly generated string
1825 during set-up.
1826 """
1828 datastoreStr = [f"datastore={root}"]
1829 """Contains all expected root locations in a format expected to be
1830 returned by Butler stringification.
1831 """
1833 datastoreName = ["FileDatastore@https://{serverName}/{root}"]
1834 """The expected format of the WebdavDatastore string."""
1836 registryStr = "/gen3.sqlite3"
1837 """Expected format of the Registry string."""
1839 serverThread = None
1840 """Thread in which the local webdav server will run"""
1842 stopWebdavServer = False
1843 """This flag will cause the webdav server to
1844 gracefully shut down when True
1845 """
1847 def genRoot(self):
1848 """Returns a random string of len 20 to serve as a root
1849 name for the temporary bucket repo.
1851 This is equivalent to tempfile.mkdtemp as this is what self.root
1852 becomes when useTempRoot is True.
1853 """
1854 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1855 return rndstr + "/"
1857 @classmethod
1858 def setUpClass(cls):
1859 # Do the same as the inherited class.
1860 cls.storageClassFactory = StorageClassFactory()
1861 cls.storageClassFactory.addFromConfig(cls.configFile)
1863 cls.portNumber = cls._getfreeport()
1864 # Run a local webdav server on which tests will be run
1865 cls.serverThread = Thread(
1866 target=cls._serveWebdav, args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), daemon=True
1867 )
1868 cls.serverThread.start()
1869 # Wait for it to start
1870 time.sleep(3)
1872 @classmethod
1873 def tearDownClass(cls):
1874 # Ask for graceful shut down of the webdav server
1875 cls.stopWebdavServer = True
1876 # Wait for the thread to exit
1877 cls.serverThread.join()
1878 super().tearDownClass()
1880 def setUp(self):
1881 config = Config(self.configFile)
1883 if self.useTempRoot:
1884 self.root = self.genRoot()
1885 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}"
1886 config.update({"datastore": {"datastore": {"root": self.rooturi}}})
1888 # need local folder to store registry database
1889 self.reg_dir = makeTestTempDir(TESTDIR)
1890 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1892 self.datastoreStr = f"datastore={self.root}"
1893 self.datastoreName = [f"FileDatastore@{self.rooturi}"]
1895 if not _is_webdav_endpoint(self.rooturi):
1896 raise OSError("Webdav server not running properly: cannot run tests.")
1898 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False)
1899 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml")
1901 def tearDown(self):
1902 # Clear temporary directory
1903 ResourcePath(self.rooturi).remove()
1904 ResourcePath(self.rooturi).session.close()
1906 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1907 shutil.rmtree(self.reg_dir, ignore_errors=True)
1909 if self.useTempRoot and os.path.exists(self.root):
1910 shutil.rmtree(self.root, ignore_errors=True)
1912 super().tearDown()
1914 def _serveWebdav(self, port: int, stopWebdavServer):
1915 """Starts a local webdav-compatible HTTP server,
1916 Listening on http://localhost:port
1917 This server only runs when this test class is instantiated,
1918 and then shuts down. Must be started is a separate thread.
1920 Parameters
1921 ----------
1922 port : `int`
1923 The port number on which the server should listen.
1924 """
1925 root_path = gettempdir()
1927 config = {
1928 "host": "0.0.0.0",
1929 "port": port,
1930 "provider_mapping": {"/": root_path},
1931 "http_authenticator": {"domain_controller": None},
1932 "simple_dc": {"user_mapping": {"*": True}},
1933 "verbose": 0,
1934 }
1935 app = WsgiDAVApp(config)
1937 server_args = {
1938 "bind_addr": (config["host"], config["port"]),
1939 "wsgi_app": app,
1940 }
1941 server = wsgi.Server(**server_args)
1942 server.prepare()
1944 try:
1945 # Start the actual server in a separate thread
1946 t = Thread(target=server.serve, daemon=True)
1947 t.start()
1948 # watch stopWebdavServer, and gracefully
1949 # shut down the server when True
1950 while True:
1951 if stopWebdavServer():
1952 break
1953 time.sleep(1)
1954 except KeyboardInterrupt:
1955 print("Caught Ctrl-C, shutting down...")
1956 finally:
1957 server.stop()
1958 t.join()
1960 def _getfreeport():
1961 """
1962 Determines a free port using sockets.
1963 """
1964 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
1965 free_socket.bind(("127.0.0.1", 0))
1966 free_socket.listen()
1967 port = free_socket.getsockname()[1]
1968 free_socket.close()
1969 return port
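# _getfreeport() has an unavoidable race: another process may grab the port
# between close() and the server bind. A hedged sketch of polling the
# endpoint for readiness, as an alternative to the fixed sleep(3) in
# setUpClass, reusing the _is_webdav_endpoint helper this module imports:
def _sketch_wait_for_webdav(uri, timeout=10.0):
    deadline = time.time() + timeout
    while time.time() < deadline:
        if _is_webdav_endpoint(uri):
            return True
        time.sleep(0.1)
    return False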
1972class PosixDatastoreTransfers(unittest.TestCase):
1973 """Test data transfers between butlers.
1975 Different dataset-ID managers are exercised: UUID to UUID and integer
1976 to integer are tested. UUID to integer is not supported, since we do
1977 not currently want to allow it. Integer to UUID is supported, with the
1978 caveat that a UUID4 will be generated, which would be incorrect for raw
1979 dataset types. The test ignores that.
1980 """
1982 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1984 @classmethod
1985 def setUpClass(cls):
1986 cls.storageClassFactory = StorageClassFactory()
1987 cls.storageClassFactory.addFromConfig(cls.configFile)
1989 def setUp(self):
1990 self.root = makeTestTempDir(TESTDIR)
1991 self.config = Config(self.configFile)
1993 def tearDown(self):
1994 removeTestTempDir(self.root)
1996 def create_butler(self, manager, label):
1997 config = Config(self.configFile)
1998 config["registry", "managers", "datasets"] = manager
1999 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
2001 def create_butlers(self, manager1, manager2):
2002 self.source_butler = self.create_butler(manager1, "1")
2003 self.target_butler = self.create_butler(manager2, "2")
2005 def testTransferUuidToUuid(self):
2006 self.create_butlers(
2007 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2008 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2009 )
2010 # Setting id_gen_map should have no effect here
2011 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
2013 def testTransferIntToInt(self):
2014 with self.assertWarns(FutureWarning):
2015 self.create_butlers(
2016 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
2017 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
2018 )
2019 # int dataset ID only allows UNIQUE
2020 self.assertButlerTransfers()
2022 def testTransferIntToUuid(self):
2023 with self.assertWarns(FutureWarning):
2024 self.create_butlers(
2025 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
2026 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2027 )
2028 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
2030 def testTransferMissing(self):
2031 """Test transfers where datastore records are missing.
2033 This is how execution butler works.
2034 """
2035 self.create_butlers(
2036 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2037 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2038 )
2040 # Configure the source butler to allow trust.
2041 self.source_butler.datastore.trustGetRequest = True
2043 self.assertButlerTransfers(purge=True)
2045 def testTransferMissingDisassembly(self):
2046 """Test transfers where datastore records are missing.
2048 This is how execution butler works.
2049 """
2050 self.create_butlers(
2051 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2052 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2053 )
2055 # Configure the source butler to allow trust.
2056 self.source_butler.datastore.trustGetRequest = True
2058 # Test disassembly.
2059 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
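# A minimal sketch of the "trust" configuration the two tests above enable:
# with trustGetRequest set, the datastore will look for file artifacts even
# when it holds no record of them, mimicking execution-butler repositories.
def _sketch_trusting_source(butler):
    butler.datastore.trustGetRequest = True  # accept artifacts without records
    return butler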
2061 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"):
2062 """Test that a run can be transferred to another butler."""
2064 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
2065 datasetTypeName = "random_data"
2067 # The test will create 3 collections, and we will transfer
2068 # two of the three.
2069 runs = ["run1", "run2", "other"]
2071 # Also want to use two different dataset types to ensure that
2072 # grouping works.
2073 datasetTypeNames = ["random_data", "random_data_2"]
2075 # Create the run collections in the source butler.
2076 for run in runs:
2077 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
2079 # Create dimensions in both butlers (transfer will not create them).
2080 n_exposures = 30
2081 for butler in (self.source_butler, self.target_butler):
2082 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
2083 butler.registry.insertDimensionData(
2084 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
2085 )
2086 butler.registry.insertDimensionData(
2087 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
2088 )
2090 for i in range(n_exposures):
2091 butler.registry.insertDimensionData(
2092 "exposure",
2093 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
2094 )
2096 # Create dataset types in the source butler.
2097 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
2098 for datasetTypeName in datasetTypeNames:
2099 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
2100 self.source_butler.registry.registerDatasetType(datasetType)
2102 # Write a dataset to an unrelated run -- this will ensure that
2103 # we are rewriting integer dataset ids in the target if necessary.
2104 # Will not be relevant for UUID.
2105 run = "distraction"
2106 butler = Butler(butler=self.source_butler, run=run)
2107 butler.put(
2108 makeExampleMetrics(),
2109 datasetTypeName,
2110 exposure=1,
2111 instrument="DummyCamComp",
2112 physical_filter="d-r",
2113 )
2115 # Write some example metrics to the source
2116 butler = Butler(butler=self.source_butler)
2118 # Set of DatasetRefs that should be in the list of refs to transfer
2119 # but which will not be transferred.
2120 deleted = set()
2122 n_expected = 20 # Number of datasets expected to be transferred
2123 source_refs = []
2124 for i in range(n_exposures):
2125 # Put a third of the datasets into each collection; only retain
2126 # two thirds.
2127 index = i % 3
2128 run = runs[index]
2129 datasetTypeName = datasetTypeNames[i % 2]
2131 metric_data = {
2132 "summary": {"counter": i},
2133 "output": {"text": "metric"},
2134 "data": [2 * x for x in range(i)],
2135 }
2136 metric = MetricsExample(**metric_data)
2137 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
2138 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
2140 # Remove the datastore record using the low-level API.
2141 if purge:
2142 # Remove records for a fraction.
2143 if index == 1:
2144 # For one of these delete the file as well.
2145 # This allows the "missing" code to filter the
2146 # file out.
2147 if not deleted:
2148 primary, uris = butler.datastore.getURIs(ref)
2149 if primary:
2150 primary.remove()
2151 for uri in uris.values():
2152 uri.remove()
2153 n_expected -= 1
2154 deleted.add(ref)
2156 # Remove the datastore record.
2157 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id})
2159 if index < 2:
2160 source_refs.append(ref)
2161 if ref not in deleted:
2162 new_metric = butler.get(ref.unresolved(), collections=run)
2163 self.assertEqual(new_metric, metric)
2165 # Create some bad dataset types to ensure we check for inconsistent
2166 # definitions.
2167 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
2168 for datasetTypeName in datasetTypeNames:
2169 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
2170 self.target_butler.registry.registerDatasetType(datasetType)
2171 with self.assertRaises(ConflictingDefinitionError):
2172 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2173 # And remove the bad definitions.
2174 for datasetTypeName in datasetTypeNames:
2175 self.target_butler.registry.removeDatasetType(datasetTypeName)
2177 # Transfer without creating dataset types should fail.
2178 with self.assertRaises(KeyError):
2179 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2181 # Now transfer them to the second butler
2182 with self.assertLogs(level=logging.DEBUG) as cm:
2183 transferred = self.target_butler.transfer_from(
2184 self.source_butler, source_refs, id_gen_map=id_gen_map, register_dataset_types=True
2185 )
2186 self.assertEqual(len(transferred), n_expected)
2187 log_output = ";".join(cm.output)
2188 self.assertIn("found in datastore for chunk", log_output)
2189 self.assertIn("Creating output run", log_output)
2191 # Do the transfer twice to ensure that it will do nothing extra.
2192 # Only do this if purge=True because it does not work for int
2193 # dataset_id.
2194 if purge:
2195 # This should not need to register dataset types.
2196 transferred = self.target_butler.transfer_from(
2197 self.source_butler, source_refs, id_gen_map=id_gen_map
2198 )
2199 self.assertEqual(len(transferred), n_expected)
2201 # Also do an explicit low-level transfer to trigger some
2202 # edge cases.
2203 with self.assertLogs(level=logging.DEBUG) as cm:
2204 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
2205 log_output = ";".join(cm.output)
2206 self.assertIn("no file artifacts exist", log_output)
2208 with self.assertRaises(TypeError):
2209 self.target_butler.datastore.transfer_from(self.source_butler, source_refs)
2211 with self.assertRaises(ValueError):
2212 self.target_butler.datastore.transfer_from(
2213 self.source_butler.datastore, source_refs, transfer="split"
2214 )
2216 # Now try to get the same refs from the new butler.
2217 for ref in source_refs:
2218 if ref not in deleted:
2219 unresolved_ref = ref.unresolved()
2220 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run)
2221 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run)
2222 self.assertEqual(new_metric, old_metric)
2224 # Now prune run2 collection and create instead a CHAINED collection.
2225 # This should block the transfer.
2226 self.target_butler.pruneCollection("run2", purge=True, unstore=True)
2227 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2228 with self.assertRaises(CollectionTypeError):
2229 # Re-importing the run1 datasets can be problematic if they
2230 # use integer IDs so filter those out.
2231 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2232 self.target_butler.transfer_from(self.source_butler, to_transfer, id_gen_map=id_gen_map)
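# A minimal sketch of the high-level transfer API that assertButlerTransfers
# exercises: query the source for refs, then hand them to the target butler,
# letting it register any missing dataset types. The collection name is
# illustrative.
def _sketch_transfer_run(source_butler, target_butler, run="run1"):
    refs = list(source_butler.registry.queryDatasets(..., collections=run))
    return target_butler.transfer_from(source_butler, refs, register_dataset_types=True)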
2235 if __name__ == "__main__":
2236 unittest.main()