# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Tests for Butler.
23"""
25import logging
26import os
27import pathlib
28import pickle
29import posixpath
30import random
31import shutil
32import socket
33import string
34import tempfile
35import time
36import unittest

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto mock_s3 can not be imported."""
        return cls
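
# When moto/boto3 are unavailable, ``boto3`` is set to None and ``mock_s3``
# becomes a pass-through class decorator, so test classes decorated with
# @mock_s3 can still be defined; S3-specific tests can then be skipped at
# runtime by checking ``boto3 is None``.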

try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None

from tempfile import gettempdir
from threading import Thread

import astropy.time
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    CollectionSearch,
    CollectionType,
    Config,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
)
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.http import _is_webdav_endpoint
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImport
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )
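

# For reference: the helper above returns a MetricsExample whose ``summary``,
# ``output``, and ``data`` attributes are what the put/get tests below compare
# against. Illustrative usage (not executed here):
#
#     metric = makeExampleMetrics()
#     metric.summary["AM1"]  # 5.2
#     metric.data[:4]        # [563, 234, 456.7, 752]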


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent misdiagnosing
    that might otherwise occur when a standard exception is used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not tested in any other test
    cases."""

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests:
    """Helper class for running a suite of put/get tests from different
    butler configurations."""

    root = None
    default_run = "ingésτ😺"
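
    # The run name above mixes accented, Greek, and emoji characters,
    # presumably to exercise Unicode handling for collection names in both
    # the registry and datastore paths.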

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it"""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)
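        # ``cls.configFile`` is not defined on this class: it is supplied by
        # the concrete test-case subclasses (e.g. PosixDatastoreButlerTestCase
        # below), which is also why this helper class does not itself inherit
        # from unittest.TestCase.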

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
        removeTestTempDir(self.root)

    def create_butler(self, run, storageClass, datasetTypeName):
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {run})

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "visit_system": 1,
                },
            )
        return butler, datasetType

    def runPutGetTest(self, storageClass, datasetTypeName):
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = self.default_run
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

            # Can the artifacts themselves be retrieved?
            if not butler.datastore.isEphemeral:
                root_uri = ResourcePath(self.root)

                for preserve_path in (True, False):
                    destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                    # Use copy so that we can test that overwrite
                    # protection works (using "auto" for File URIs would
                    # use hard links and subsequent transfer would work
                    # because it knows they are the same file).
                    transferred = butler.retrieveArtifacts(
                        [ref], destination, preserve_path=preserve_path, transfer="copy"
                    )
                    self.assertGreater(len(transferred), 0)
                    artifacts = list(ResourcePath.findFileResources([destination]))
                    self.assertEqual(set(transferred), set(artifacts))

                    for artifact in transferred:
                        path_in_destination = artifact.relative_to(destination)
                        self.assertIsNotNone(path_in_destination)

                        # When the path is not preserved there should not be
                        # any path separators.
                        num_seps = path_in_destination.count("/")
                        if preserve_path:
                            self.assertGreater(num_seps, 0)
                        else:
                            self.assertEqual(num_seps, 0)

                    primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                    n_uris = len(secondary_uris)
                    if primary_uri:
                        n_uris += 1
                    self.assertEqual(
                        len(artifacts),
                        n_uris,
                        "Comparing expected artifacts vs actual:"
                        f" {artifacts} vs {primary_uri} and {secondary_uris}",
                    )

                    if preserve_path:
                        # No need to run these twice
                        with self.assertRaises(ValueError):
                            butler.retrieveArtifacts([ref], destination, transfer="move")

                        with self.assertRaises(FileExistsError):
                            butler.retrieveArtifacts([ref], destination)

                        transferred_again = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, overwrite=True
                        )
                        self.assertEqual(set(transferred_again), set(transferred))

            # Now remove the dataset completely.
            butler.pruneDatasets([ref], purge=True, unstore=True, run=this_run)
            # Lookup with original args should still fail.
            with self.assertRaises(LookupError):
                butler.datasetExists(*args, collections=this_run)
            # getDirect() should still fail.
            with self.assertRaises(FileNotFoundError):
                butler.getDirect(ref)
            # Registry shouldn't be able to find it by dataset_id anymore.
            self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know the collections are
            # empty.
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
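        # The "slice" parameter is applied on read: as the assertions below
        # show, only the ``data`` attribute is truncated to the first ``stop``
        # elements, while ``summary`` and ``output`` come back unchanged.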
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present
        return butler

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is a
        # CollectionError.
        with self.assertRaises(CollectionError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Deleting the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run=self.default_run)
            self.assertIsInstance(butler, Butler)

            # Even with a ResourcePath.
            butler = Butler(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {self.default_run})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, CollectionSearch.fromExpression(["other"]))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), {"label", "bad_label"})
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler("not_there", writeable=False)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases"):
            # No aliases registered.
            Butler("not_there")
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testIngest(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile, refs=refs, formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy", record_validation_info=False)

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

    def testPruneCollections(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge and not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
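        # Concretely: looking up the shared data ID through chain1 resolves
        # to ref1, since run1 is searched first; ref2 remains reachable only
        # by querying run2 directly.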
        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [])

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)

    def testPickle(self):
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            (
                "instrument",
                {"instrument": "DummyCam"},
                {"instrument": "DummyHSC"},
                {"instrument": "DummyCamComp"},
            ),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are not
        # created for its components, but querying can still return them.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # Create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" is missing a key we know it should be
        # inheriting from defaults rather than writing out explicitly.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""

        butler = Butler(self.tmpConfigFile, run=self.default_run)

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Checks if file exists at a given path (relative to root).

        Test testPutTemplates verifies actual physical existence of the files
        in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(
                butler.datastore.root, f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle"
            ),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(
                butler.datastore.root, f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle"
            ),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """This test does an export to a temp directory and an import back
        into a new temp directory repo. It does not assume a posix datastore.
        """
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        print("Root:", exportButler.datastore.root)
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
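        # Note: (50).to_bytes(8, byteorder="little") is just the fixed 8-byte
        # value b"2\x00\x00\x00\x00\x00\x00\x00", serving as a placeholder
        # skymap hash.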
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements even
                # though there aren't any in these datasets or in the database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public API.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                        reuse_ids=False,
                    )
                importButler = Butler(importDir, run=self.default_run)
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )

    def testRemoveRuns(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put a dataset in each.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])
        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler"""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self):
        """Independent test of constructor using PathLike."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self):
        """Test local export using several file-based transfer modes."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        datastoreRoot = exportButler.datastore.root

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume local file system
            self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
                    export.saveDatasets(datasets)
                for path in pathsInStore:
                    self.assertTrue(
                        self.checkFileExists(exportDir, path),
                        f"Check that mode {transfer} exported files",
                    )
1330 def testPruneDatasets(self):
1331 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1332 butler = Butler(self.tmpConfigFile, writeable=True)
1333 # Load registry data with dimensions to hang datasets off of.
1334 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1335 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1336 # Add some RUN-type collections.
1337 run1 = "run1"
1338 butler.registry.registerRun(run1)
1339 run2 = "run2"
1340 butler.registry.registerRun(run2)
1341 # put some datasets. ref1 and ref2 have the same data ID, and are in
1342 # different runs. ref3 has a different data ID.
1343 metric = makeExampleMetrics()
1344 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1345 datasetType = self.addDatasetType(
1346 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1347 )
1348 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1349 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1350 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1352 # Simple prune.
1353 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1354 with self.assertRaises(LookupError):
1355 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)
1357 # Put data back.
1358 ref1 = butler.put(metric, ref1.unresolved(), run=run1)
1359 ref2 = butler.put(metric, ref2.unresolved(), run=run2)
1360 ref3 = butler.put(metric, ref3.unresolved(), run=run1)
1362 # Check that in normal mode, deleting the record will lead to
1363 # trash not touching the file.
1364 uri1 = butler.datastore.getURI(ref1)
1365 butler.datastore.bridge.moveToTrash([ref1]) # Update the dataset_location table
1366 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
1367 butler.datastore.trash(ref1)
1368 butler.datastore.emptyTrash()
1369 self.assertTrue(uri1.exists())
1370 uri1.remove() # Clean it up.
1372 # Simulate execution butler setup by deleting the datastore
1373 # record but keeping the file around and trusting.
1374 butler.datastore.trustGetRequest = True
1375 uri2 = butler.datastore.getURI(ref2)
1376 uri3 = butler.datastore.getURI(ref3)
1377 self.assertTrue(uri2.exists())
1378 self.assertTrue(uri3.exists())
1380 # Remove the datastore record.
1381 butler.datastore.bridge.moveToTrash([ref2]) # Update the dataset_location table
1382 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
1383 self.assertTrue(uri2.exists())
1384 butler.datastore.trash([ref2, ref3])
1385 # Immediate removal for ref2 file
1386 self.assertFalse(uri2.exists())
1387 # But ref3 has to wait for the empty.
1388 self.assertTrue(uri3.exists())
1389 butler.datastore.emptyTrash()
1390 self.assertFalse(uri3.exists())
1392 # Clear out the datasets from registry.
1393 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
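    # Hedged sketch of the trust-mode bookkeeping exercised above: when
    # trustGetRequest is True and the datastore record is already gone,
    # trash() removes the file artifact immediately instead of waiting for
    # emptyTrash(). Illustrative only; it mirrors the low-level calls used
    # in the test and is not part of the original suite.
    def _example_trusted_trash(self, butler, ref):
        butler.datastore.trustGetRequest = True
        uri = butler.datastore.getURI(ref)
        # Drop the location and datastore records via the low-level APIs,
        # exactly as the test above does.
        butler.datastore.bridge.moveToTrash([ref])
        butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id})
        butler.datastore.trash(ref)  # no record left -> immediate removal
        assert not uri.exists()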
1395 def testPytypePutCoercion(self):
1396 """Test python type coercion on Butler.get and put."""
1398 # Store some data with the normal example storage class.
1399 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1400 datasetTypeName = "test_metric"
1401 butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)
1403 dataId = {"instrument": "DummyCamComp", "visit": 423}
1405 # Put a dict and this should coerce to a MetricsExample
1406 test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
1407 metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
1408 test_metric = butler.getDirect(metric_ref)
1409 self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
1410 self.assertEqual(test_metric.summary, test_dict["summary"])
1411 self.assertEqual(test_metric.output, test_dict["output"])
1413 # Check that the put still works if a DatasetType is given with
1414 # a definition matching this python type.
1415 registry_type = butler.registry.getDatasetType(datasetTypeName)
1416 this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
1417 metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
1418 self.assertEqual(metric2_ref.datasetType, registry_type)
1420 # The get will return the type expected by registry.
1421 test_metric2 = butler.getDirect(metric2_ref)
1422 self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")
1424 # Make a new DatasetRef with the compatible but different DatasetType.
1425 # This should now return a dict.
1426 new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
1427 test_dict2 = butler.getDirect(new_ref)
1428 self.assertEqual(get_full_type_name(test_dict2), "dict")
1430 # Get it again with the wrong dataset type definition using get()
1431 # rather than getDirect(). This should be consistent with getDirect()
1432 # behavior and return the type of the DatasetType.
1433 test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
1434 self.assertEqual(get_full_type_name(test_dict3), "dict")
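    # Condensed sketch of the put-time coercion shown above: a plain dict
    # is converted to the python type of the registry-defined storage
    # class on put, so getDirect() returns a MetricsExample. Illustrative
    # only; not part of the original suite.
    def _example_put_coercion(self, butler, datasetTypeName, dataId):
        ref = butler.put({"summary": {"a": 1}, "output": {"b": 2}}, datasetTypeName, dataId=dataId)
        metric = butler.getDirect(ref)
        assert type(metric).__name__ == "MetricsExample"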
1436 def testPytypeCoercion(self):
1437 """Test python type coercion on Butler.get and put."""
1439 # Store some data with the normal example storage class.
1440 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1441 datasetTypeName = "test_metric"
1442 butler = self.runPutGetTest(storageClass, datasetTypeName)
1444 dataId = {"instrument": "DummyCamComp", "visit": 423}
1445 metric = butler.get(datasetTypeName, dataId=dataId)
1446 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1448 datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
1449 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1451 # Now need to hack the registry dataset type definition.
1452 # There is no API for this.
1453 manager = butler.registry._managers.datasets
1454 manager._db.update(
1455 manager._static.dataset_type,
1456 {"name": datasetTypeName},
1457 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1458 )
1460 # Force reset of dataset type cache
1461 butler.registry.refresh()
1463 datasetType_new = butler.registry.getDatasetType(datasetTypeName)
1464 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1465 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1467 metric_model = butler.get(datasetTypeName, dataId=dataId)
1468 self.assertNotEqual(type(metric_model), type(metric))
1469 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1471 # Put the model and read it back to show that everything now
1472 # works as normal.
1473 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1474 metric_model_new = butler.get(metric_ref)
1475 self.assertEqual(metric_model_new, metric_model)
1477 # Hack the storage class again to something that will fail on the
1478 # get with no conversion class.
1479 manager._db.update(
1480 manager._static.dataset_type,
1481 {"name": datasetTypeName},
1482 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1483 )
1484 butler.registry.refresh()
1486 with self.assertRaises(ValueError):
1487 butler.get(datasetTypeName, dataId=dataId)
1490class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1491 """InMemoryDatastore specialization of a butler"""
1493 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1494 fullConfigKey = None
1495 useTempRoot = False
1496 validationCanFail = False
1497 datastoreStr = ["datastore='InMemory"]
1498 datastoreName = ["InMemoryDatastore@"]
1499 registryStr = "/gen3.sqlite3"
1501 def testIngest(self):
1502 pass
1505class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1506 """PosixDatastore specialization"""
1508 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
1509 fullConfigKey = ".datastore.datastores.1.formatters"
1510 validationCanFail = True
1511 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
1512 datastoreName = [
1513 "InMemoryDatastore@",
1514 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
1515 "SecondDatastore",
1516 ]
1517 registryStr = "/gen3.sqlite3"
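# A hedged sketch of the shape of a chained-datastore configuration when
# built programmatically. The class paths and the <butlerRoot> tag are
# assumptions inferred from the datastoreName values above, not a copy of
# config/basic/butler-chained.yaml.
_EXAMPLE_CHAINED_DATASTORE_CONFIG = {
    "datastore": {
        "cls": "lsst.daf.butler.datastores.chainedDatastore.ChainedDatastore",
        "datastores": [
            {"cls": "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"},
            {
                "cls": "lsst.daf.butler.datastores.fileDatastore.FileDatastore",
                "root": "<butlerRoot>/FileDatastore_1",
            },
        ],
    },
}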
1520class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
1521 """Test that a yaml file in one location can refer to a root in another."""
1523 datastoreStr = ["dir1"]
1524 # Disable the makeRepo test since we are deliberately not using
1525 # butler.yaml as the config name.
1526 fullConfigKey = None
1528 def setUp(self):
1529 self.root = makeTestTempDir(TESTDIR)
1531 # Make a new repository in one place
1532 self.dir1 = os.path.join(self.root, "dir1")
1533 Butler.makeRepo(self.dir1, config=Config(self.configFile))
1535 # Move the yaml file to a different place and add a "root"
1536 self.dir2 = os.path.join(self.root, "dir2")
1537 os.makedirs(self.dir2, exist_ok=True)
1538 configFile1 = os.path.join(self.dir1, "butler.yaml")
1539 config = Config(configFile1)
1540 config["root"] = self.dir1
1541 configFile2 = os.path.join(self.dir2, "butler2.yaml")
1542 config.dumpToUri(configFile2)
1543 os.remove(configFile1)
1544 self.tmpConfigFile = configFile2
1546 def testFileLocations(self):
1547 self.assertNotEqual(self.dir1, self.dir2)
1548 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
1549 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
1550 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
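# Hedged sketch of the relocation pattern tested above: build a repo in
# one directory, then write a config elsewhere whose "root" key points
# back at it. The argument names are illustrative, not part of the
# original suite.
def _example_relocate_config(repo_dir, config_dir):
    config = Config(os.path.join(repo_dir, "butler.yaml"))
    config["root"] = repo_dir
    relocated = os.path.join(config_dir, "butler2.yaml")
    config.dumpToUri(relocated)
    return Butler(relocated, writeable=False)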
1553class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
1554 """Test that a config file created by makeRepo outside of repo works."""
1556 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1558 def setUp(self):
1559 self.root = makeTestTempDir(TESTDIR)
1560 self.root2 = makeTestTempDir(TESTDIR)
1562 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
1563 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1565 def tearDown(self):
1566 if os.path.exists(self.root2):
1567 shutil.rmtree(self.root2, ignore_errors=True)
1568 super().tearDown()
1570 def testConfigExistence(self):
1571 c = Config(self.tmpConfigFile)
1572 uri_config = ResourcePath(c["root"])
1573 uri_expected = ResourcePath(self.root, forceDirectory=True)
1574 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
1575 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
1577 def testPutGet(self):
1578 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1579 self.runPutGetTest(storageClass, "test_metric")
1582class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
1583 """Test that a config file created by makeRepo outside of repo works."""
1585 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1587 def setUp(self):
1588 self.root = makeTestTempDir(TESTDIR)
1589 self.root2 = makeTestTempDir(TESTDIR)
1591 self.tmpConfigFile = self.root2
1592 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1594 def testConfigExistence(self):
1595 # Append the yaml file name, otherwise the Config constructor does
1596 # not know the file type.
1597 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
1598 super().testConfigExistence()
1601class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
1602 """Test that a config file created by makeRepo outside of repo works."""
1604 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1606 def setUp(self):
1607 self.root = makeTestTempDir(TESTDIR)
1608 self.root2 = makeTestTempDir(TESTDIR)
1610 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
1611 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1614@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
1615class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1616 """S3Datastore specialization of a butler; an S3 storage Datastore +
1617 a local SQLite-backed SqlRegistry.
1618 """
1620 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
1621 fullConfigKey = None
1622 validationCanFail = True
1624 bucketName = "anybucketname"
1625 """Name of the Bucket that will be used in the tests. The name is read from
1626 the config file used with the tests during set-up.
1627 """
1629 root = "butlerRoot/"
1630 """Root repository directory expected to be used in case useTempRoot=False.
1631 Otherwise the root is set to a 20 characters long randomly generated string
1632 during set-up.
1633 """
1635 datastoreStr = [f"datastore={root}"]
1636 """Contains all expected root locations in a format expected to be
1637 returned by Butler stringification.
1638 """
1640 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
1641 """The expected format of the S3 Datastore string."""
1643 registryStr = "/gen3.sqlite3"
1644 """Expected format of the Registry string."""
1646 mock_s3 = mock_s3()
1647 """The mocked s3 interface from moto."""
1649 def genRoot(self):
1650 """Returns a random string of len 20 to serve as a root
1651 name for the temporary bucket repo.
1653 This is equivalent to tempfile.mkdtemp as this is what self.root
1654 becomes when useTempRoot is True.
1655 """
1656 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1657 return rndstr + "/"
1659 def setUp(self):
1660 config = Config(self.configFile)
1661 uri = ResourcePath(config[".datastore.datastore.root"])
1662 self.bucketName = uri.netloc
1664 # Enable S3 mocking of tests.
1665 self.mock_s3.start()
1667 # set up some fake credentials if they do not exist
1668 self.usingDummyCredentials = setAwsEnvCredentials()
1670 if self.useTempRoot:
1671 self.root = self.genRoot()
1672 rooturi = f"s3://{self.bucketName}/{self.root}"
1673 config.update({"datastore": {"datastore": {"root": rooturi}}})
1675 # need local folder to store registry database
1676 self.reg_dir = makeTestTempDir(TESTDIR)
1677 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1679 # Moto needs to know that we expect the bucket to exist
1680 # (the name used to be the class attribute bucketName).
1681 s3 = boto3.resource("s3")
1682 s3.create_bucket(Bucket=self.bucketName)
1684 self.datastoreStr = f"datastore={self.root}"
1685 self.datastoreName = [f"FileDatastore@{rooturi}"]
1686 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
1687 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
1689 def tearDown(self):
1690 s3 = boto3.resource("s3")
1691 bucket = s3.Bucket(self.bucketName)
1692 try:
1693 bucket.objects.all().delete()
1694 except botocore.exceptions.ClientError as e:
1695 if e.response["Error"]["Code"] == "404":
1696 # the key was not reachable - pass
1697 pass
1698 else:
1699 raise
1701 bucket = s3.Bucket(self.bucketName)
1702 bucket.delete()
1704 # Stop the S3 mock.
1705 self.mock_s3.stop()
1707 # unset any potentially set dummy credentials
1708 if self.usingDummyCredentials:
1709 unsetAwsEnvCredentials()
1711 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1712 shutil.rmtree(self.reg_dir, ignore_errors=True)
1714 if self.useTempRoot and os.path.exists(self.root):
1715 shutil.rmtree(self.root, ignore_errors=True)
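# A self-contained sketch of the moto pattern this class relies on: the
# mock intercepts boto3 calls so a bucket can be "created" and used with
# no real AWS access. Assumes moto is importable; the bucket name and key
# are illustrative only.
def _example_mocked_s3_roundtrip():
    mock = mock_s3()
    mock.start()
    usingDummy = setAwsEnvCredentials()
    try:
        s3 = boto3.resource("s3")
        s3.create_bucket(Bucket="example-bucket")
        s3.Object("example-bucket", "key.txt").put(Body=b"hello")
        body = s3.Object("example-bucket", "key.txt").get()["Body"].read()
        assert body == b"hello"
    finally:
        mock.stop()
        if usingDummy:
            unsetAwsEnvCredentials()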
1718@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!")
1719class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1720 """WebdavDatastore specialization of a butler; a Webdav storage Datastore +
1721 a local SQLite-backed SqlRegistry.
1722 """
1724 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml")
1725 fullConfigKey = None
1726 validationCanFail = True
1728 serverName = "localhost"
1729 """Name of the server that will be used in the tests.
1730 """
1732 portNumber = 8080
1733 """Port on which the webdav server listens. Automatically chosen
1734 at setUpClass via the _getfreeport() method
1735 """
1737 root = "butlerRoot/"
1738 """Root repository directory expected to be used in case useTempRoot=False.
1739 Otherwise the root is set to a 20 characters long randomly generated string
1740 during set-up.
1741 """
1743 datastoreStr = [f"datastore={root}"]
1744 """Contains all expected root locations in a format expected to be
1745 returned by Butler stringification.
1746 """
1748 datastoreName = ["FileDatastore@https://{serverName}/{root}"]
1749 """The expected format of the WebdavDatastore string."""
1751 registryStr = "/gen3.sqlite3"
1752 """Expected format of the Registry string."""
1754 serverThread = None
1755 """Thread in which the local webdav server will run"""
1757 stopWebdavServer = False
1758 """This flag will cause the webdav server to
1759 gracefully shut down when True
1760 """
1762 def genRoot(self):
1763 """Returns a random string of len 20 to serve as a root
1764 name for the temporary bucket repo.
1766 This is equivalent to tempfile.mkdtemp as this is what self.root
1767 becomes when useTempRoot is True.
1768 """
1769 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1770 return rndstr + "/"
1772 @classmethod
1773 def setUpClass(cls):
1774 # Do the same as the inherited class.
1775 cls.storageClassFactory = StorageClassFactory()
1776 cls.storageClassFactory.addFromConfig(cls.configFile)
1778 cls.portNumber = cls._getfreeport()
1779 # Run a local webdav server on which tests will be run
1780 cls.serverThread = Thread(
1781 target=cls._serveWebdav, args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), daemon=True
1782 )
1783 cls.serverThread.start()
1784 # Wait for it to start
1785 time.sleep(3)
1787 @classmethod
1788 def tearDownClass(cls):
1789 # Ask for graceful shut down of the webdav server
1790 cls.stopWebdavServer = True
1791 # Wait for the thread to exit
1792 cls.serverThread.join()
1794 def setUp(self):
1795 config = Config(self.configFile)
1797 if self.useTempRoot:
1798 self.root = self.genRoot()
1799 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}"
1800 config.update({"datastore": {"datastore": {"root": self.rooturi}}})
1802 # need local folder to store registry database
1803 self.reg_dir = makeTestTempDir(TESTDIR)
1804 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1806 self.datastoreStr = f"datastore={self.root}"
1807 self.datastoreName = [f"FileDatastore@{self.rooturi}"]
1809 if not _is_webdav_endpoint(self.rooturi):
1810 raise OSError("Webdav server not running properly: cannot run tests.")
1812 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False)
1813 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml")
1815 def tearDown(self):
1816 # Clear temporary directory
1817 ResourcePath(self.rooturi).remove()
1818 ResourcePath(self.rooturi).session.close()
1820 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1821 shutil.rmtree(self.reg_dir, ignore_errors=True)
1823 if self.useTempRoot and os.path.exists(self.root):
1824 shutil.rmtree(self.root, ignore_errors=True)
1826 def _serveWebdav(self, port: int, stopWebdavServer):
1827 """Starts a local webdav-compatible HTTP server,
1828 Listening on http://localhost:port
1829 This server only runs when this test class is instantiated,
1830 and then shuts down. Must be started is a separate thread.
1832 Parameters
1833 ----------
1834 port : `int`
1835 The port number on which the server should listen.
1836 """
1837 root_path = gettempdir()
1839 config = {
1840 "host": "0.0.0.0",
1841 "port": port,
1842 "provider_mapping": {"/": root_path},
1843 "http_authenticator": {"domain_controller": None},
1844 "simple_dc": {"user_mapping": {"*": True}},
1845 "verbose": 0,
1846 }
1847 app = WsgiDAVApp(config)
1849 server_args = {
1850 "bind_addr": (config["host"], config["port"]),
1851 "wsgi_app": app,
1852 }
1853 server = wsgi.Server(**server_args)
1854 server.prepare()
1856 try:
1857 # Start the actual server in a separate thread
1858 t = Thread(target=server.serve, daemon=True)
1859 t.start()
1860 # watch stopWebdavServer, and gracefully
1861 # shut down the server when True
1862 while True:
1863 if stopWebdavServer():
1864 break
1865 time.sleep(1)
1866 except KeyboardInterrupt:
1867 print("Caught Ctrl-C, shutting down...")
1868 finally:
1869 server.stop()
1870 t.join()
1872 def _getfreeport():
1873 """
1874 Determines a free port using sockets.
1875 """
1876 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
1877 free_socket.bind(("0.0.0.0", 0))
1878 free_socket.listen()
1879 port = free_socket.getsockname()[1]
1880 free_socket.close()
1881 return port
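# Usage sketch for the helper above (illustrative only). Note the
# inherent, accepted race: the ephemeral port is released before the
# webdav server binds it, so another process could claim it in between.
def _example_free_port():
    port = WebdavDatastoreButlerTestCase._getfreeport()
    assert 0 < port < 65536
    return port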
1884class PosixDatastoreTransfers(unittest.TestCase):
1885 """Test data transfers between butlers.
1887 Transfers are tested for different dataset ID managers: UUID to UUID
1888 and integer to integer are covered. UUID to integer is not supported,
1889 since we do not currently want to allow that. Integer to UUID is
1890 supported, with the caveat that a UUID4 will be generated, which is
1891 incorrect for raw dataset types; the tests ignore that.
1892 """
1894 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1896 @classmethod
1897 def setUpClass(cls):
1898 cls.storageClassFactory = StorageClassFactory()
1899 cls.storageClassFactory.addFromConfig(cls.configFile)
1901 def setUp(self):
1902 self.root = makeTestTempDir(TESTDIR)
1903 self.config = Config(self.configFile)
1905 def tearDown(self):
1906 removeTestTempDir(self.root)
1908 def create_butler(self, manager, label):
1909 config = Config(self.configFile)
1910 config["registry", "managers", "datasets"] = manager
1911 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
1913 def create_butlers(self, manager1, manager2):
1914 self.source_butler = self.create_butler(manager1, "1")
1915 self.target_butler = self.create_butler(manager2, "2")
1917 def testTransferUuidToUuid(self):
1918 self.create_butlers(
1919 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1920 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1921 )
1922 # Setting id_gen_map should have no effect here
1923 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1925 def testTransferIntToInt(self):
1926 self.create_butlers(
1927 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1928 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1929 )
1930 # int dataset ID only allows UNIQUE
1931 self.assertButlerTransfers()
1933 def testTransferIntToUuid(self):
1934 self.create_butlers(
1935 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1936 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1937 )
1938 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1940 def testTransferMissing(self):
1941 """Test transfers where datastore records are missing.
1943 This is how execution butler works.
1944 """
1945 self.create_butlers(
1946 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1947 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1948 )
1950 # Configure the source butler to allow trust.
1951 self.source_butler.datastore.trustGetRequest = True
1953 self.assertButlerTransfers(purge=True)
1955 def testTransferMissingDisassembly(self):
1956 """Test transfers where datastore records are missing.
1958 This is how execution butler works.
1959 """
1960 self.create_butlers(
1961 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1962 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1963 )
1965 # Configure the source butler to allow trust.
1966 self.source_butler.datastore.trustGetRequest = True
1968 # Test disassembly.
1969 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
1971 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"):
1972 """Test that a run can be transferred to another butler."""
1974 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
1975 datasetTypeName = "random_data"
1977 # Test will create 3 collections and we will want to transfer
1978 # two of those three.
1979 runs = ["run1", "run2", "other"]
1981 # Also want to use two different dataset types to ensure that
1982 # grouping works.
1983 datasetTypeNames = ["random_data", "random_data_2"]
1985 # Create the run collections in the source butler.
1986 for run in runs:
1987 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
1989 # Create dimensions in both butlers (transfer will not create them).
1990 n_exposures = 30
1991 for butler in (self.source_butler, self.target_butler):
1992 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1993 butler.registry.insertDimensionData(
1994 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
1995 )
1996 butler.registry.insertDimensionData(
1997 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
1998 )
2000 for i in range(n_exposures):
2001 butler.registry.insertDimensionData(
2002 "exposure",
2003 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
2004 )
2006 # Create dataset types in the source butler.
2007 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
2008 for datasetTypeName in datasetTypeNames:
2009 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
2010 self.source_butler.registry.registerDatasetType(datasetType)
2012 # Write a dataset to an unrelated run -- this will ensure that
2013 # we are rewriting integer dataset ids in the target if necessary.
2014 # Will not be relevant for UUID.
2015 run = "distraction"
2016 butler = Butler(butler=self.source_butler, run=run)
2017 butler.put(
2018 makeExampleMetrics(),
2019 datasetTypeName,
2020 exposure=1,
2021 instrument="DummyCamComp",
2022 physical_filter="d-r",
2023 )
2025 # Write some example metrics to the source
2026 butler = Butler(butler=self.source_butler)
2028 # Set of DatasetRefs that should be in the list of refs to transfer
2029 # but which will not be transferred.
2030 deleted = set()
2032 n_expected = 20 # Number of datasets expected to be transferred
2033 source_refs = []
2034 for i in range(n_exposures):
2035 # Put a third of datasets into each collection, only retain
2036 # two thirds.
2037 index = i % 3
2038 run = runs[index]
2039 datasetTypeName = datasetTypeNames[i % 2]
2041 metric_data = {
2042 "summary": {"counter": i},
2043 "output": {"text": "metric"},
2044 "data": [2 * x for x in range(i)],
2045 }
2046 metric = MetricsExample(**metric_data)
2047 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
2048 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
2050 # Remove the datastore record using low-level API
2051 if purge:
2052 # Remove records for a fraction.
2053 if index == 1:
2055 # For one of these delete the file as well.
2056 # This allows the "missing" code to filter the
2057 # file out.
2058 if not deleted:
2059 primary, uris = butler.datastore.getURIs(ref)
2060 if primary:
2061 primary.remove()
2062 for uri in uris.values():
2063 uri.remove()
2064 n_expected -= 1
2065 deleted.add(ref)
2067 # Remove the datastore record.
2068 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id})
2070 if index < 2:
2071 source_refs.append(ref)
2072 if ref not in deleted:
2073 new_metric = butler.get(ref.unresolved(), collections=run)
2074 self.assertEqual(new_metric, metric)
2076 # Create some bad dataset types to ensure we check for inconsistent
2077 # definitions.
2078 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
2079 for datasetTypeName in datasetTypeNames:
2080 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
2081 self.target_butler.registry.registerDatasetType(datasetType)
2082 with self.assertRaises(ConflictingDefinitionError):
2083 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2084 # And remove the bad definitions.
2085 for datasetTypeName in datasetTypeNames:
2086 self.target_butler.registry.removeDatasetType(datasetTypeName)
2088 # Transfer without creating dataset types should fail.
2089 with self.assertRaises(KeyError):
2090 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2092 # Now transfer them to the second butler
2093 with self.assertLogs(level=logging.DEBUG) as cm:
2094 transferred = self.target_butler.transfer_from(
2095 self.source_butler, source_refs, id_gen_map=id_gen_map, register_dataset_types=True
2096 )
2097 self.assertEqual(len(transferred), n_expected)
2098 log_output = ";".join(cm.output)
2099 self.assertIn("found in datastore for chunk", log_output)
2100 self.assertIn("Creating output run", log_output)
2102 # Do the transfer twice to ensure that it will do nothing extra.
2103 # Only do this if purge=True because it does not work for int
2104 # dataset_id.
2105 if purge:
2106 # This should not need to register dataset types.
2107 transferred = self.target_butler.transfer_from(
2108 self.source_butler, source_refs, id_gen_map=id_gen_map
2109 )
2110 self.assertEqual(len(transferred), n_expected)
2112 # Also do an explicit low-level transfer to trigger some
2113 # edge cases.
2114 with self.assertLogs(level=logging.DEBUG) as cm:
2115 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
2116 log_output = ";".join(cm.output)
2117 self.assertIn("no file artifacts exist", log_output)
2119 with self.assertRaises(TypeError):
2120 self.target_butler.datastore.transfer_from(self.source_butler, source_refs)
2122 with self.assertRaises(ValueError):
2123 self.target_butler.datastore.transfer_from(
2124 self.source_butler.datastore, source_refs, transfer="split"
2125 )
2127 # Now try to get the same refs from the new butler.
2128 for ref in source_refs:
2129 if ref not in deleted:
2130 unresolved_ref = ref.unresolved()
2131 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run)
2132 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run)
2133 self.assertEqual(new_metric, old_metric)
2135 # Now prune run2 collection and create instead a CHAINED collection.
2136 # This should block the transfer.
2137 self.target_butler.pruneCollection("run2", purge=True, unstore=True)
2138 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2139 with self.assertRaises(CollectionTypeError):
2140 # Re-importing the run1 datasets can be problematic if they
2141 # use integer IDs so filter those out.
2142 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2143 self.target_butler.transfer_from(self.source_butler, to_transfer, id_gen_map=id_gen_map)
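# Condensed sketch of the transfer flow exercised above; `source` and
# `target` are assumed to be writeable Butlers and `refs` resolved
# DatasetRefs known to the source registry. Illustrative only.
def _example_transfer(source, target, refs):
    # Dataset types must already exist in the target unless
    # register_dataset_types=True is passed.
    transferred = target.transfer_from(source, refs, register_dataset_types=True)
    return len(transferred)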
2146if __name__ == "__main__":
2147 unittest.main()