Coverage for tests/test_butler.py: 16%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Tests for Butler.
23"""

import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import socket
import string
import tempfile
import time
import unittest
import unittest.mock  # Needed explicitly for unittest.mock.patch below.

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto mock_s3 cannot be imported."""
        return cls


try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None

from tempfile import gettempdir
from threading import Thread

import astropy.time
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    CollectionSearch,
    CollectionType,
    Config,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
)
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.http import isWebdavEndpoint
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImport
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
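    """Return a simple MetricsExample with fixed summary, output, and data
    values, used as the payload for the put/get tests below.
    """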
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent misdiagnosis
    that might otherwise occur when a standard exception is used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not covered by any other test
    cases.
    """

    def testSearchPath(self):
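        """Test that additional search paths can override entries from the
        default butler configuration.
        """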
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests:
    """Helper methods for running a suite of put/get tests from different
    butler configurations.
    """

    root = None

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
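        """Assert that butler.get() of each named component, both directly
        and through a deferred dataset handle, matches the corresponding
        attribute of the reference object.
        """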
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
        removeTestTempDir(self.root)

    def create_butler(self, run, storageClass, datasetTypeName):
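        """Create a butler writing to the given run, register the named
        dataset type, and insert the instrument, filter, and visit dimension
        records used by the put/get tests.
        """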
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "visit_system": 1,
                },
            )
        return butler, datasetType

    def runPutGetTest(self, storageClass, datasetTypeName):
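        """Run a standard suite of put/get/prune operations for the given
        storage class and dataset type name, returning the butler so that
        callers can run further checks.
        """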
        # New datasets will be put into their own RUN collections below;
        # only the final put uses the butler's default run.
        run = "ingest"
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

                # Can the artifacts themselves be retrieved?
                if not butler.datastore.isEphemeral:
                    root_uri = ResourcePath(self.root)

                    for preserve_path in (True, False):
                        destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                        # Use copy so that we can test that overwrite
                        # protection works (using "auto" for File URIs would
                        # use hard links and subsequent transfer would work
                        # because it knows they are the same file).
                        transferred = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, transfer="copy"
                        )
                        self.assertGreater(len(transferred), 0)
                        artifacts = list(ResourcePath.findFileResources([destination]))
                        self.assertEqual(set(transferred), set(artifacts))

                        for artifact in transferred:
                            path_in_destination = artifact.relative_to(destination)
                            self.assertIsNotNone(path_in_destination)

                            # When path is not preserved there should not be
                            # any path separators.
                            num_seps = path_in_destination.count("/")
                            if preserve_path:
                                self.assertGreater(num_seps, 0)
                            else:
                                self.assertEqual(num_seps, 0)

                        primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                        n_uris = len(secondary_uris)
                        if primary_uri:
                            n_uris += 1
                        self.assertEqual(
                            len(artifacts),
                            n_uris,
                            "Comparing expected artifacts vs actual:"
                            f" {artifacts} vs {primary_uri} and {secondary_uris}",
                        )

                        if preserve_path:
                            # No need to run these twice
                            with self.assertRaises(ValueError):
                                butler.retrieveArtifacts([ref], destination, transfer="move")

                            with self.assertRaises(FileExistsError):
                                butler.retrieveArtifacts([ref], destination)

                            transferred_again = butler.retrieveArtifacts(
                                [ref], destination, preserve_path=preserve_path, overwrite=True
                            )
                            self.assertEqual(set(transferred_again), set(transferred))

                # Now remove the dataset completely.
                butler.pruneDatasets([ref], purge=True, unstore=True, run=this_run)
                # Lookup with original args should still fail.
                with self.assertRaises(LookupError):
                    butler.datasetExists(*args, collections=this_run)
                # getDirect() should still fail.
                with self.assertRaises(FileNotFoundError):
                    butler.getDirect(ref)
                # Registry shouldn't be able to find it by dataset_id anymore.
                self.assertIsNone(butler.registry.getDataset(ref.id))

                # Do explicit registry removal since we know they are
                # empty
                butler.registry.removeCollection(this_run)
                expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present
        return butler

    def testDeferredCollectionPassing(self):
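        """Test a butler constructed with no run or collection, passing
        collections explicitly to each put/get call instead.
        """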
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is a
        # CollectionError.
        with self.assertRaises(CollectionError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Removing the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run="ingest")
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {"ingest"})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, CollectionSearch.fromExpression(["other"]))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
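        """Test put/get of a composite with disassembly disabled; the
        dataset should be stored as a single artifact.
        """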
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
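        """Test put/get of a composite that is disassembled into
        per-component artifacts (except for in-memory datastores, which
        never disassemble).
        """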
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testIngest(self):
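        """Test ingesting external files, both one file per dataset and a
        single file shared by multiple datasets.
        """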
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile, refs=refs, formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy", record_validation_info=False)

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory cannot ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

    def testPruneCollections(self):
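        """Test pruning RUN, TAGGED, and CHAINED collections, including the
        argument combinations that are required to fail.
        """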
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge and not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [])

        # Now that the collections have been pruned we can remove the
        # dataset type
        butler.registry.removeDatasetType(datasetType.name)

    def testPickle(self):
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
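        """Test querying dataset types, including their components, and
        validating the butler configuration against them.
        """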
        butler = Butler(self.tmpConfigFile, run="ingest")
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            (
                "instrument",
                {"instrument": "DummyCam"},
                {"instrument": "DummyHSC"},
                {"instrument": "DummyCamComp"},
            ),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are not created
        # for components, but querying can return the components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self):
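        """Test that dimension, registry, and datastore changes made inside
        a failed transaction are all rolled back.
        """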
        butler = Butler(self.tmpConfigFile, run="ingest")
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""

        butler = Butler(self.tmpConfigFile, run="ingest")

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check if a file exists at a given path (relative to root).

        Test testPutTemplates verifies actual physical existence of the files
        in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
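        """Test that file templates produce the expected paths in the
        datastore and that templates with non-unique filenames are rejected.
        """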
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric1/??#?/d-r/DummyCamComp_423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric2/d-r/DummyCamComp_v423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """This test does an export to a temp directory and an import back
        into a new temp directory repo. It does not assume a posix datastore.
        """
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        print("Root:", exportButler.datastore.root)
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements even
                # though there aren't any in these datasets or in the database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public api.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                        reuse_ids=False,
                    )
                importButler = Butler(importDir, run="ingest")
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )

    def testRemoveRuns(self):
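        """Test removing RUN collections with and without unstoring the
        associated datasets.
        """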
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # put a dataset in each
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])
        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self):
        """Independent test of constructor using PathLike."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self):
        """Test local export using various transfer modes."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        datastoreRoot = exportButler.datastore.root

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume local file system
            self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
                    export.saveDatasets(datasets)
                for path in pathsInStore:
                    self.assertTrue(
                        self.checkFileExists(exportDir, path),
                        f"Check that mode {transfer} exported files",
                    )

    def testPruneDatasets(self):
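        """Test dataset pruning, including datastore trash handling when
        records are removed while files remain on disk.
        """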
1315 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1316 butler = Butler(self.tmpConfigFile, writeable=True)
1317 # Load registry data with dimensions to hang datasets off of.
1318 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1319 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1320 # Add some RUN-type collections.
1321 run1 = "run1"
1322 butler.registry.registerRun(run1)
1323 run2 = "run2"
1324 butler.registry.registerRun(run2)
1325 # put some datasets. ref1 and ref2 have the same data ID, and are in
1326 # different runs. ref3 has a different data ID.
1327 metric = makeExampleMetrics()
1328 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1329 datasetType = self.addDatasetType(
1330 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1331 )
1332 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1333 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1334 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1336 # Simple prune.
1337 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1338 with self.assertRaises(LookupError):
1339 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)
1341 # Put data back.
1342 ref1 = butler.put(metric, ref1.unresolved(), run=run1)
1343 ref2 = butler.put(metric, ref2.unresolved(), run=run2)
1344 ref3 = butler.put(metric, ref3.unresolved(), run=run1)
1346 # Check that in normal mode, deleting the record will lead to
1347 # trash not touching the file.
1348 uri1 = butler.datastore.getURI(ref1)
1349 butler.datastore.bridge.moveToTrash([ref1]) # Update the dataset_location table
1350 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
1351 butler.datastore.trash(ref1)
1352 butler.datastore.emptyTrash()
1353 self.assertTrue(uri1.exists())
1354 uri1.remove() # Clean it up.
1356 # Simulate execution butler setup by deleting the datastore
1357 # record but keeping the file around and trusting.
1358 butler.datastore.trustGetRequest = True
1359 uri2 = butler.datastore.getURI(ref2)
1360 uri3 = butler.datastore.getURI(ref3)
1361 self.assertTrue(uri2.exists())
1362 self.assertTrue(uri3.exists())
1364 # Remove the datastore record.
1365 butler.datastore.bridge.moveToTrash([ref2]) # Update the dataset_location table
1366 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
1367 self.assertTrue(uri2.exists())
1368 butler.datastore.trash([ref2, ref3])
1369 # Immediate removal of the ref2 file.
1370 self.assertFalse(uri2.exists())
1371 # But ref3 has to wait for the trash to be emptied.
1372 self.assertTrue(uri3.exists())
1373 butler.datastore.emptyTrash()
1374 self.assertFalse(uri3.exists())
1376 # Clear out the datasets from registry.
1377 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1379 def testPytypePutCoercion(self):
1380 """Test python type coercion on Butler.get and put."""
1382 # Store some data with the normal example storage class.
1383 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1384 datasetTypeName = "test_metric"
1385 butler, _ = self.create_butler("ingest", storageClass, datasetTypeName)
1387 dataId = {"instrument": "DummyCamComp", "visit": 423}
1389 # Put a dict; this should be coerced to a MetricsExample.
1390 test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
1391 metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
1392 test_metric = butler.getDirect(metric_ref)
1393 self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
1394 self.assertEqual(test_metric.summary, test_dict["summary"])
1395 self.assertEqual(test_metric.output, test_dict["output"])
1397 # Check that the put still works if a DatasetType is given with
1398 # a definition matching this python type.
1399 registry_type = butler.registry.getDatasetType(datasetTypeName)
1400 this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
1401 metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
1402 self.assertEqual(metric2_ref.datasetType, registry_type)
1404 # The get will return the type expected by registry.
1405 test_metric2 = butler.getDirect(metric2_ref)
1406 self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")
1408 # Make a new DatasetRef with the compatible but different DatasetType.
1409 # This should now return a dict.
1410 new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
1411 test_dict2 = butler.getDirect(new_ref)
1412 self.assertEqual(get_full_type_name(test_dict2), "dict")
1414 # Get it again with the wrong dataset type definition using get()
1415 # rather than getDirect(). This should be consistent with getDirect()
1416 # behavior and return the type of the DatasetType.
1417 test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
1418 self.assertEqual(get_full_type_name(test_dict3), "dict")
1420 def testPytypeCoercion(self):
1421 """Test python type coercion on Butler.get and put."""
1423 # Store some data with the normal example storage class.
1424 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1425 datasetTypeName = "test_metric"
1426 butler = self.runPutGetTest(storageClass, datasetTypeName)
1428 dataId = {"instrument": "DummyCamComp", "visit": 423}
1429 metric = butler.get(datasetTypeName, dataId=dataId)
1430 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1432 datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
1433 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1435 # Now need to hack the registry dataset type definition.
1436 # There is no API for this.
1437 manager = butler.registry._managers.datasets
1438 manager._db.update(
1439 manager._static.dataset_type,
1440 {"name": datasetTypeName},
1441 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1442 )
1444 # Force reset of dataset type cache
1445 butler.registry.refresh()
1447 datasetType_new = butler.registry.getDatasetType(datasetTypeName)
1448 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1449 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1451 metric_model = butler.get(datasetTypeName, dataId=dataId)
1452 self.assertNotEqual(type(metric_model), type(metric))
1453 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1455 # Put the model and read it back to show that everything now
1456 # works as normal.
1457 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1458 metric_model_new = butler.get(metric_ref)
1459 self.assertEqual(metric_model_new, metric_model)
1461 # Hack the storage class again to something that will fail on the
1462 # get with no conversion class.
1463 manager._db.update(
1464 manager._static.dataset_type,
1465 {"name": datasetTypeName},
1466 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1467 )
1468 butler.registry.refresh()
1470 with self.assertRaises(ValueError):
1471 butler.get(datasetTypeName, dataId=dataId)
1474class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1475 """InMemoryDatastore specialization of a butler"""
1477 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1478 fullConfigKey = None
1479 useTempRoot = False
1480 validationCanFail = False
1481 datastoreStr = ["datastore='InMemory"]
1482 datastoreName = ["InMemoryDatastore@"]
1483 registryStr = "/gen3.sqlite3"
1485 def testIngest(self):
1486 pass
1489class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1490 """PosixDatastore specialization"""
1492 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
1493 fullConfigKey = ".datastore.datastores.1.formatters"
1494 validationCanFail = True
1495 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
1496 datastoreName = [
1497 "InMemoryDatastore@",
1498 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
1499 "SecondDatastore",
1500 ]
1501 registryStr = "/gen3.sqlite3"
1504class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
1505 """Test that a yaml file in one location can refer to a root in another."""
1507 datastoreStr = ["dir1"]
1508 # Disable the makeRepo test since we are deliberately not using
1509 # butler.yaml as the config name.
1510 fullConfigKey = None
1512 def setUp(self):
1513 self.root = makeTestTempDir(TESTDIR)
1515 # Make a new repository in one place
1516 self.dir1 = os.path.join(self.root, "dir1")
1517 Butler.makeRepo(self.dir1, config=Config(self.configFile))
1519 # Move the yaml file to a different place and add a "root"
1520 self.dir2 = os.path.join(self.root, "dir2")
1521 os.makedirs(self.dir2, exist_ok=True)
1522 configFile1 = os.path.join(self.dir1, "butler.yaml")
1523 config = Config(configFile1)
1524 config["root"] = self.dir1
1525 configFile2 = os.path.join(self.dir2, "butler2.yaml")
1526 config.dumpToUri(configFile2)
1527 os.remove(configFile1)
1528 self.tmpConfigFile = configFile2
1530 def testFileLocations(self):
1531 self.assertNotEqual(self.dir1, self.dir2)
1532 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
1533 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
1534 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
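    # Illustrative sketch, hypothetical rather than an actual test here:
    # because butler2.yaml carries an explicit "root" entry pointing at
    # dir1, a Butler constructed from the relocated config still resolves
    # its datastore under dir1.
    def _exampleRelocatedButler(self):
        """Construct a Butler from the relocated config (illustration only)."""
        butler = Butler(self.tmpConfigFile)
        # The datastore root should resolve under dir1 via the "root" entry.
        assert "dir1" in str(butler.datastore.root)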
1537class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
1538 """Test that a config file created by makeRepo outside of repo works."""
1540 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1542 def setUp(self):
1543 self.root = makeTestTempDir(TESTDIR)
1544 self.root2 = makeTestTempDir(TESTDIR)
1546 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
1547 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1549 def tearDown(self):
1550 if os.path.exists(self.root2):
1551 shutil.rmtree(self.root2, ignore_errors=True)
1552 super().tearDown()
1554 def testConfigExistence(self):
1555 c = Config(self.tmpConfigFile)
1556 uri_config = ResourcePath(c["root"])
1557 uri_expected = ResourcePath(self.root, forceDirectory=True)
1558 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
1559 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
1561 def testPutGet(self):
1562 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1563 self.runPutGetTest(storageClass, "test_metric")
1566class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
1567 """Test that a config file created by makeRepo outside of repo works."""
1569 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1571 def setUp(self):
1572 self.root = makeTestTempDir(TESTDIR)
1573 self.root2 = makeTestTempDir(TESTDIR)
1575 self.tmpConfigFile = self.root2
1576 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1578 def testConfigExistence(self):
1579 # Append the yaml file name, otherwise the Config constructor does
1580 # not know the file type.
1581 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
1582 super().testConfigExistence()
1585class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
1586 """Test that a config file created by makeRepo outside of repo works."""
1588 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1590 def setUp(self):
1591 self.root = makeTestTempDir(TESTDIR)
1592 self.root2 = makeTestTempDir(TESTDIR)
1594 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
1595 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1598@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
1599class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1600 """S3Datastore specialization of a butler; an S3 storage Datastore +
1601 a local in-memory SqlRegistry.
1602 """
1604 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
1605 fullConfigKey = None
1606 validationCanFail = True
1608 bucketName = "anybucketname"
1609 """Name of the Bucket that will be used in the tests. The name is read from
1610 the config file used with the tests during set-up.
1611 """
1613 root = "butlerRoot/"
1614 """Root repository directory expected to be used in case useTempRoot=False.
1615 Otherwise the root is set to a 20 characters long randomly generated string
1616 during set-up.
1617 """
1619 datastoreStr = [f"datastore={root}"]
1620 """Contains all expected root locations in a format expected to be
1621 returned by Butler stringification.
1622 """
1624 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
1625 """The expected format of the S3 Datastore string."""
1627 registryStr = "/gen3.sqlite3"
1628 """Expected format of the Registry string."""
1630 mock_s3 = mock_s3()
1631 """The mocked s3 interface from moto."""
1633 def genRoot(self):
1634 """Returns a random string of len 20 to serve as a root
1635 name for the temporary bucket repo.
1637 This is equivalent to tempfile.mkdtemp as this is what self.root
1638 becomes when useTempRoot is True.
1639 """
1640 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1641 return rndstr + "/"
1643 def setUp(self):
1644 config = Config(self.configFile)
1645 uri = ResourcePath(config[".datastore.datastore.root"])
1646 self.bucketName = uri.netloc
1648 # Enable S3 mocking of tests.
1649 self.mock_s3.start()
1651 # set up some fake credentials if they do not exist
1652 self.usingDummyCredentials = setAwsEnvCredentials()
1654 if self.useTempRoot:
1655 self.root = self.genRoot()
1656 rooturi = f"s3://{self.bucketName}/{self.root}"
1657 config.update({"datastore": {"datastore": {"root": rooturi}}})
1659 # Need a local folder to store the registry database.
1660 self.reg_dir = makeTestTempDir(TESTDIR)
1661 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1663 # Moto needs to know that we expect the bucket to exist
1664 # (the name used to be the class attribute bucketName).
1665 s3 = boto3.resource("s3")
1666 s3.create_bucket(Bucket=self.bucketName)
1668 self.datastoreStr = f"datastore={self.root}"
1669 self.datastoreName = [f"FileDatastore@{rooturi}"]
1670 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
1671 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
1673 def tearDown(self):
1674 s3 = boto3.resource("s3")
1675 bucket = s3.Bucket(self.bucketName)
1676 try:
1677 bucket.objects.all().delete()
1678 except botocore.exceptions.ClientError as e:
1679 if e.response["Error"]["Code"] == "404":
1680 # the key was not reachable - pass
1681 pass
1682 else:
1683 raise
1685 bucket = s3.Bucket(self.bucketName)
1686 bucket.delete()
1688 # Stop the S3 mock.
1689 self.mock_s3.stop()
1691 # unset any potentially set dummy credentials
1692 if self.usingDummyCredentials:
1693 unsetAwsEnvCredentials()
1695 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1696 shutil.rmtree(self.reg_dir, ignore_errors=True)
1698 if self.useTempRoot and os.path.exists(self.root):
1699 shutil.rmtree(self.root, ignore_errors=True)
1702@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!")
1703# Mock required environment variables during tests
1704@unittest.mock.patch.dict(
1705 os.environ,
1706 {
1707 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
1708 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
1709 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
1710 },
1711)
1712class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1713 """WebdavDatastore specialization of a butler; a Webdav storage Datastore +
1714 a local in-memory SqlRegistry.
1715 """
1717 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml")
1718 fullConfigKey = None
1719 validationCanFail = True
1721 serverName = "localhost"
1722 """Name of the server that will be used in the tests.
1723 """
1725 portNumber = 8080
1726 """Port on which the webdav server listens. Automatically chosen
1727 in setUpClass via the _getfreeport() method.
1728 """
1730 root = "butlerRoot/"
1731 """Root repository directory expected to be used in case useTempRoot=False.
1732 Otherwise the root is set to a 20 characters long randomly generated string
1733 during set-up.
1734 """
1736 datastoreStr = [f"datastore={root}"]
1737 """Contains all expected root locations in a format expected to be
1738 returned by Butler stringification.
1739 """
1741 datastoreName = ["FileDatastore@https://{serverName}/{root}"]
1742 """The expected format of the WebdavDatastore string."""
1744 registryStr = "/gen3.sqlite3"
1745 """Expected format of the Registry string."""
1747 serverThread = None
1748 """Thread in which the local webdav server will run"""
1750 stopWebdavServer = False
1751 """This flag will cause the webdav server to
1752 gracefully shut down when True
1753 """
1755 def genRoot(self):
1756 """Returns a random string of len 20 to serve as a root
1757 name for the temporary bucket repo.
1759 This is equivalent to tempfile.mkdtemp as this is what self.root
1760 becomes when useTempRoot is True.
1761 """
1762 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1763 return rndstr + "/"
1765 @classmethod
1766 def setUpClass(cls):
1767 # Do the same as inherited class
1768 cls.storageClassFactory = StorageClassFactory()
1769 cls.storageClassFactory.addFromConfig(cls.configFile)
1771 cls.portNumber = cls._getfreeport()
1772 # Run a local webdav server on which tests will be run
1773 cls.serverThread = Thread(
1774 target=cls._serveWebdav, args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), daemon=True
1775 )
1776 cls.serverThread.start()
1777 # Wait for it to start
1778 time.sleep(3)
1780 @classmethod
1781 def tearDownClass(cls):
1782 # Ask for graceful shut down of the webdav server
1783 cls.stopWebdavServer = True
1784 # Wait for the thread to exit
1785 cls.serverThread.join()
1787 # Mock required environment variables during tests
1788 @unittest.mock.patch.dict(
1789 os.environ,
1790 {
1791 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
1792 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
1793 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
1794 },
1795 )
1796 def setUp(self):
1797 config = Config(self.configFile)
1799 if self.useTempRoot:
1800 self.root = self.genRoot()
1801 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}"
1802 config.update({"datastore": {"datastore": {"root": self.rooturi}}})
1804 # Need a local folder to store the registry database.
1805 self.reg_dir = makeTestTempDir(TESTDIR)
1806 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1808 self.datastoreStr = f"datastore={self.root}"
1809 self.datastoreName = [f"FileDatastore@{self.rooturi}"]
1811 if not isWebdavEndpoint(self.rooturi):
1812 raise OSError("Webdav server not running properly: cannot run tests.")
1814 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False)
1815 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml")
1817 # Mock required environment variables during tests
1818 @unittest.mock.patch.dict(
1819 os.environ,
1820 {
1821 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
1822 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
1823 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
1824 },
1825 )
1826 def tearDown(self):
1827 # Clear temporary directory
1828 ResourcePath(self.rooturi).remove()
1829 ResourcePath(self.rooturi).session.close()
1831 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1832 shutil.rmtree(self.reg_dir, ignore_errors=True)
1834 if self.useTempRoot and os.path.exists(self.root):
1835 shutil.rmtree(self.root, ignore_errors=True)
1837 def _serveWebdav(self, port: int, stopWebdavServer):
1838 """Starts a local webdav-compatible HTTP server,
1839 Listening on http://localhost:port
1840 This server only runs when this test class is instantiated,
1841 and then shuts down. Must be started is a separate thread.
1843 Parameters
1844 ----------
1845 port : `int`
1846 The port number on which the server should listen
1847 """
1848 root_path = gettempdir()
1850 config = {
1851 "host": "0.0.0.0",
1852 "port": port,
1853 "provider_mapping": {"/": root_path},
1854 "http_authenticator": {"domain_controller": None},
1855 "simple_dc": {"user_mapping": {"*": True}},
1856 "verbose": 0,
1857 }
1858 app = WsgiDAVApp(config)
1860 server_args = {
1861 "bind_addr": (config["host"], config["port"]),
1862 "wsgi_app": app,
1863 }
1864 server = wsgi.Server(**server_args)
1865 server.prepare()
1867 try:
1868 # Start the actual server in a separate thread
1869 t = Thread(target=server.serve, daemon=True)
1870 t.start()
1871 # watch stopWebdavServer, and gracefully
1872 # shut down the server when True
1873 while True:
1874 if stopWebdavServer():
1875 break
1876 time.sleep(1)
1877 except KeyboardInterrupt:
1878 print("Caught Ctrl-C, shutting down...")
1879 finally:
1880 server.stop()
1881 t.join()
1883 def _getfreeport():
1884 """
1885 Determines a free port using sockets.
1886 """
1887 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
1888 free_socket.bind(("0.0.0.0", 0))
1889 free_socket.listen()
1890 port = free_socket.getsockname()[1]
1891 free_socket.close()
1892 return port
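# Note: the socket is closed before the port is handed to the webdav
# server, so another process could in principle claim the port in the
# interim; that small race is acceptable for these tests.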
1895class PosixDatastoreTransfers(unittest.TestCase):
1896 """Test data transfers between butlers.
1898 Tests run for different manager combinations. UUID to UUID and integer
1899 to integer are tested. UUID to integer is not supported since we do not
1900 currently want to allow that. Integer to UUID is supported, with the
1901 caveat that a UUID4 will be generated, which is incorrect for raw
1902 dataset types; the test ignores that.
1903 """
1905 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1907 @classmethod
1908 def setUpClass(cls):
1909 cls.storageClassFactory = StorageClassFactory()
1910 cls.storageClassFactory.addFromConfig(cls.configFile)
1912 def setUp(self):
1913 self.root = makeTestTempDir(TESTDIR)
1914 self.config = Config(self.configFile)
1916 def tearDown(self):
1917 removeTestTempDir(self.root)
1919 def create_butler(self, manager, label):
1920 config = Config(self.configFile)
1921 config["registry", "managers", "datasets"] = manager
1922 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
1924 def create_butlers(self, manager1, manager2):
1925 self.source_butler = self.create_butler(manager1, "1")
1926 self.target_butler = self.create_butler(manager2, "2")
1928 def testTransferUuidToUuid(self):
1929 self.create_butlers(
1930 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1931 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1932 )
1933 # Setting id_gen_map should have no effect here
1934 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1936 def testTransferIntToInt(self):
1937 self.create_butlers(
1938 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1939 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1940 )
1941 # int dataset ID only allows UNIQUE
1942 self.assertButlerTransfers()
1944 def testTransferIntToUuid(self):
1945 self.create_butlers(
1946 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1947 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1948 )
1949 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1951 def testTransferMissing(self):
1952 """Test transfers where datastore records are missing.
1954 This is how execution butler works.
1955 """
1956 self.create_butlers(
1957 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1958 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1959 )
1961 # Configure the source butler to allow trust.
1962 self.source_butler.datastore.trustGetRequest = True
1964 self.assertButlerTransfers(purge=True)
1966 def testTransferMissingDisassembly(self):
1967 """Test transfers where datastore records are missing.
1969 This is how execution butler works.
1970 """
1971 self.create_butlers(
1972 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1973 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1974 )
1976 # Configure the source butler to allow trust.
1977 self.source_butler.datastore.trustGetRequest = True
1979 # Test disassembly.
1980 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
1982 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"):
1983 """Test that a run can be transferred to another butler."""
1985 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
1986 datasetTypeName = "random_data"
1988 # Test will create 3 collections and we will want to transfer
1989 # two of those three.
1990 runs = ["run1", "run2", "other"]
1992 # Also want to use two different dataset types to ensure that
1993 # grouping works.
1994 datasetTypeNames = ["random_data", "random_data_2"]
1996 # Create the run collections in the source butler.
1997 for run in runs:
1998 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
2000 # Create dimensions in both butlers (transfer will not create them).
2001 n_exposures = 30
2002 for butler in (self.source_butler, self.target_butler):
2003 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
2004 butler.registry.insertDimensionData(
2005 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
2006 )
2007 butler.registry.insertDimensionData(
2008 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
2009 )
2011 for i in range(n_exposures):
2012 butler.registry.insertDimensionData(
2013 "exposure",
2014 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
2015 )
2017 # Create dataset types in the source butler.
2018 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
2019 for datasetTypeName in datasetTypeNames:
2020 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
2021 self.source_butler.registry.registerDatasetType(datasetType)
2023 # Write a dataset to an unrelated run -- this will ensure that
2024 # we are rewriting integer dataset ids in the target if necessary.
2025 # Will not be relevant for UUID.
2026 run = "distraction"
2027 butler = Butler(butler=self.source_butler, run=run)
2028 butler.put(
2029 makeExampleMetrics(),
2030 datasetTypeName,
2031 exposure=1,
2032 instrument="DummyCamComp",
2033 physical_filter="d-r",
2034 )
2036 # Write some example metrics to the source
2037 butler = Butler(butler=self.source_butler)
2039 # Set of DatasetRefs that should be in the list of refs to transfer
2040 # but which will not be transferred.
2041 deleted = set()
2043 n_expected = 20 # Number of datasets expected to be transferred
2044 source_refs = []
2045 for i in range(n_exposures):
2046 # Put a third of the datasets into each collection; only retain
2047 # two thirds.
2048 index = i % 3
2049 run = runs[index]
2050 datasetTypeName = datasetTypeNames[i % 2]
2052 metric_data = {
2053 "summary": {"counter": i},
2054 "output": {"text": "metric"},
2055 "data": [2 * x for x in range(i)],
2056 }
2057 metric = MetricsExample(**metric_data)
2058 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
2059 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
2061 # Remove the datastore record using low-level API
2062 if purge:
2063 # Remove records for a fraction.
2064 if index == 1:
2066 # For one of these delete the file as well.
2067 # This allows the "missing" code to filter the
2068 # file out.
2069 if not deleted:
2070 primary, uris = butler.datastore.getURIs(ref)
2071 if primary:
2072 primary.remove()
2073 for uri in uris.values():
2074 uri.remove()
2075 n_expected -= 1
2076 deleted.add(ref)
2078 # Remove the datastore record.
2079 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id})
2081 if index < 2:
2082 source_refs.append(ref)
2083 if ref not in deleted:
2084 new_metric = butler.get(ref.unresolved(), collections=run)
2085 self.assertEqual(new_metric, metric)
2087 # Create some bad dataset types to ensure we check for inconsistent
2088 # definitions.
2089 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
2090 for datasetTypeName in datasetTypeNames:
2091 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
2092 self.target_butler.registry.registerDatasetType(datasetType)
2093 with self.assertRaises(ConflictingDefinitionError):
2094 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2095 # And remove the bad definitions.
2096 for datasetTypeName in datasetTypeNames:
2097 self.target_butler.registry.removeDatasetType(datasetTypeName)
2099 # Transfer without creating dataset types should fail.
2100 with self.assertRaises(KeyError):
2101 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2103 # Now transfer them to the second butler
2104 with self.assertLogs(level=logging.DEBUG) as cm:
2105 transferred = self.target_butler.transfer_from(
2106 self.source_butler, source_refs, id_gen_map=id_gen_map, register_dataset_types=True
2107 )
2108 self.assertEqual(len(transferred), n_expected)
2109 log_output = ";".join(cm.output)
2110 self.assertIn("found in datastore for chunk", log_output)
2111 self.assertIn("Creating output run", log_output)
2113 # Do the transfer twice to ensure that it will do nothing extra.
2114 # Only do this if purge=True because it does not work for int
2115 # dataset_id.
2116 if purge:
2117 # This should not need to register dataset types.
2118 transferred = self.target_butler.transfer_from(
2119 self.source_butler, source_refs, id_gen_map=id_gen_map
2120 )
2121 self.assertEqual(len(transferred), n_expected)
2123 # Also do an explicit low-level transfer to trigger some
2124 # edge cases.
2125 with self.assertLogs(level=logging.DEBUG) as cm:
2126 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
2127 log_output = ";".join(cm.output)
2128 self.assertIn("no file artifacts exist", log_output)
2130 with self.assertRaises(TypeError):
2131 self.target_butler.datastore.transfer_from(self.source_butler, source_refs)
2133 with self.assertRaises(ValueError):
2134 self.target_butler.datastore.transfer_from(
2135 self.source_butler.datastore, source_refs, transfer="split"
2136 )
2138 # Now try to get the same refs from the new butler.
2139 for ref in source_refs:
2140 if ref not in deleted:
2141 unresolved_ref = ref.unresolved()
2142 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run)
2143 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run)
2144 self.assertEqual(new_metric, old_metric)
2146 # Now prune the run2 collection and create a CHAINED collection instead.
2147 # This should block the transfer.
2148 self.target_butler.pruneCollection("run2", purge=True, unstore=True)
2149 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2150 with self.assertRaises(CollectionTypeError):
2151 # Re-importing the run1 datasets can be problematic if they
2152 # use integer IDs so filter those out.
2153 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2154 self.target_butler.transfer_from(self.source_butler, to_transfer, id_gen_map=id_gen_map)
2157if __name__ == "__main__":
2158 unittest.main()