Coverage for tests/test_butler.py: 15%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Tests for Butler.
23"""
25import logging
26import os
27import pathlib
28import pickle
29import posixpath
30import random
31import shutil
32import socket
33import string
34import tempfile
35import time
36import unittest

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto mock_s3 can not be imported."""
        return cls


try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None

from tempfile import gettempdir
from threading import Thread

import astropy.time
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    CollectionSearch,
    CollectionType,
    Config,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
)
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.http import _is_webdav_endpoint
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImport
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
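    """Return an example MetricsExample with fixed, arbitrary contents."""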
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent misdiagnosis
    that might otherwise occur when a standard exception is used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not covered by any other test
    cases."""

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests:
    """Helper class providing a suite of put/get tests to run against
    different butler configurations."""

    root = None
    default_run = "ingésτ😺"

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it"""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
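        """Check that each named component can be retrieved individually and
        via a deferred handle, and that each matches the reference value."""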
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
        removeTestTempDir(self.root)

    def create_butler(self, run, storageClass, datasetTypeName):
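        """Create a Butler for the given run, register the named dataset
        type, and insert the dimension records used by these tests.

        Returns the butler and the registered dataset type.
        """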
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "visit_system": 1,
                },
            )
        return butler, datasetType

    def runPutGetTest(self, storageClass, datasetTypeName):
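        """Run a suite of put/get round-trip tests for the given storage
        class and dataset type, covering direct and deferred gets, component
        retrieval, artifact retrieval, parameters, and dataset removal.

        Returns the butler, with one dataset left in place for downstream
        tests.
        """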
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = self.default_run
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

            # Can the artifacts themselves be retrieved?
            if not butler.datastore.isEphemeral:
                root_uri = ResourcePath(self.root)

                for preserve_path in (True, False):
                    destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                    # Use copy so that we can test that overwrite
                    # protection works (using "auto" for File URIs would
                    # use hard links and subsequent transfer would work
                    # because it knows they are the same file).
                    transferred = butler.retrieveArtifacts(
                        [ref], destination, preserve_path=preserve_path, transfer="copy"
                    )
                    self.assertGreater(len(transferred), 0)
                    artifacts = list(ResourcePath.findFileResources([destination]))
                    self.assertEqual(set(transferred), set(artifacts))

                    for artifact in transferred:
                        path_in_destination = artifact.relative_to(destination)
                        self.assertIsNotNone(path_in_destination)

                        # when path is not preserved there should not be
                        # any path separators.
                        num_seps = path_in_destination.count("/")
                        if preserve_path:
                            self.assertGreater(num_seps, 0)
                        else:
                            self.assertEqual(num_seps, 0)

                    primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                    n_uris = len(secondary_uris)
                    if primary_uri:
                        n_uris += 1
                    self.assertEqual(
                        len(artifacts),
                        n_uris,
                        "Comparing expected artifacts vs actual:"
                        f" {artifacts} vs {primary_uri} and {secondary_uris}",
                    )

                    if preserve_path:
                        # No need to run these twice
                        with self.assertRaises(ValueError):
                            butler.retrieveArtifacts([ref], destination, transfer="move")

                        with self.assertRaises(FileExistsError):
                            butler.retrieveArtifacts([ref], destination)

                        transferred_again = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, overwrite=True
                        )
                        self.assertEqual(set(transferred_again), set(transferred))

            # Now remove the dataset completely.
            butler.pruneDatasets([ref], purge=True, unstore=True, run=this_run)
            # Lookup with original args should still fail.
            with self.assertRaises(LookupError):
                butler.datasetExists(*args, collections=this_run)
            # getDirect() should still fail.
            with self.assertRaises(FileNotFoundError):
                butler.getDirect(ref)
            # Registry shouldn't be able to find it by dataset_id anymore.
            self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know they are
            # empty
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)
        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present
        return butler

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is a
        # CollectionError.
        with self.assertRaises(CollectionError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Deleting the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run=self.default_run)
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {self.default_run})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, CollectionSearch.fromExpression(["other"]))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler("not_there", writeable=False)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases"):
            # No aliases registered.
            Butler("not_there")
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self):
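        """Run the put/get tests with a simple storage class that has no
        components."""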
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
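        """Run the put/get tests with a composite storage class that is
        stored as a single file (no disassembly)."""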
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
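        """Run the put/get tests with a composite storage class that file
        datastores store disassembled, one file per component."""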
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testIngest(self):
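        """Ingest pre-existing files, first one file per dataset and then
        multiple datasets sharing a single file."""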
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile, refs=refs, formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy", record_validation_info=False)

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

    def testPruneCollections(self):
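        """Exercise pruneCollection on RUN, TAGGED, and CHAINED collections,
        checking both registry membership and datastore existence."""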
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge and not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [])

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)

    def testPickle(self):
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
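        """Register several dataset types, check that component dataset
        types can be queried, and validate the butler configuration."""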
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            (
                "instrument",
                {"instrument": "DummyCam"},
                {"instrument": "DummyHSC"},
                {"instrument": "DummyCamComp"},
            ),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are not
        # created for its components, but querying can still return the
        # component dataset types.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self):
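        """Check that an exception raised inside a butler transaction rolls
        back the dimension inserts and the put made within it."""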
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check whether a file exists at the given path (relative to root).

        Test testPutTemplates verifies actual physical existence of the files
        in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
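        """Check that datasets land at the file template locations we expect
        and that a template without unique filenames is rejected."""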
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(
                butler.datastore.root, f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle"
            ),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(
                butler.datastore.root, f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle"
            ),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """Export to a temp directory and import back into a new
        temp-directory repo. Does not assume a POSIX datastore."""
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        print("Root:", exportButler.datastore.root)
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements
                # even though there aren't any in these datasets or in the
                # database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public api.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                        reuse_ids=False,
                    )
                importButler = Butler(importDir, run=self.default_run)
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )

    def testRemoveRuns(self):
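        """Check removeRuns with and without unstore: both forget the run
        and its datasets, but only unstore=True deletes the files."""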
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put a dataset in each.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])
        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler"""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self):
        """Independent test of constructor using PathLike."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self):
        """Test local export using several file transfer modes."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        datastoreRoot = exportButler.datastore.root

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume local file system
            self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
                    export.saveDatasets(datasets)
                for path in pathsInStore:
                    self.assertTrue(
                        self.checkFileExists(exportDir, path),
                        f"Check that mode {transfer} exported files",
                    )

    def testPruneDatasets(self):
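        """Exercise pruneDatasets together with low-level datastore trash
        handling, including the trust mode used by execution butlers."""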
1327 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1328 butler = Butler(self.tmpConfigFile, writeable=True)
1329 # Load registry data with dimensions to hang datasets off of.
1330 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1331 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1332 # Add some RUN-type collections.
1333 run1 = "run1"
1334 butler.registry.registerRun(run1)
1335 run2 = "run2"
1336 butler.registry.registerRun(run2)
1337 # put some datasets. ref1 and ref2 have the same data ID, and are in
1338 # different runs. ref3 has a different data ID.
1339 metric = makeExampleMetrics()
1340 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1341 datasetType = self.addDatasetType(
1342 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1343 )
1344 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1345 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1346 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1348 # Simple prune.
1349 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1350 with self.assertRaises(LookupError):
1351 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)
1353 # Put data back.
1354 ref1 = butler.put(metric, ref1.unresolved(), run=run1)
1355 ref2 = butler.put(metric, ref2.unresolved(), run=run2)
1356 ref3 = butler.put(metric, ref3.unresolved(), run=run1)
1358 # Check that in normal mode, deleting the record will lead to
1359 # trash not touching the file.
1360 uri1 = butler.datastore.getURI(ref1)
1361 butler.datastore.bridge.moveToTrash([ref1]) # Update the dataset_location table
1362 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
1363 butler.datastore.trash(ref1)
1364 butler.datastore.emptyTrash()
1365 self.assertTrue(uri1.exists())
1366 uri1.remove() # Clean it up.
1368 # Simulate execution butler setup by deleting the datastore
1369 # record but keeping the file around and trusting.
1370 butler.datastore.trustGetRequest = True
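# (trustGetRequest=True makes the datastore assume an artifact still
# exists even when it holds no datastore record for it.)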
1371 uri2 = butler.datastore.getURI(ref2)
1372 uri3 = butler.datastore.getURI(ref3)
1373 self.assertTrue(uri2.exists())
1374 self.assertTrue(uri3.exists())
1376 # Remove the datastore record.
1377 butler.datastore.bridge.moveToTrash([ref2]) # Update the dataset_location table
1378 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
1379 self.assertTrue(uri2.exists())
1380 butler.datastore.trash([ref2, ref3])
1381 # ref2 no longer has a datastore record, so its file is removed immediately.
1382 self.assertFalse(uri2.exists())
1383 # But ref3 still has a record, so it has to wait for emptyTrash.
1384 self.assertTrue(uri3.exists())
1385 butler.datastore.emptyTrash()
1386 self.assertFalse(uri3.exists())
1388 # Clear out the datasets from registry.
1389 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1391 def testPytypePutCoercion(self):
1392 """Test python type coercion on Butler.get and put."""
1394 # Store some data with the normal example storage class.
1395 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1396 datasetTypeName = "test_metric"
1397 butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)
1399 dataId = {"instrument": "DummyCamComp", "visit": 423}
1401 # Put a dict and this should coerce to a MetricsExample
1402 test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
1403 metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
1404 test_metric = butler.getDirect(metric_ref)
1405 self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
1406 self.assertEqual(test_metric.summary, test_dict["summary"])
1407 self.assertEqual(test_metric.output, test_dict["output"])
1409 # Check that the put still works if a DatasetType is given with
1410 # a definition matching this python type.
1411 registry_type = butler.registry.getDatasetType(datasetTypeName)
1412 this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
1413 metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
1414 self.assertEqual(metric2_ref.datasetType, registry_type)
1416 # The get will return the type expected by registry.
1417 test_metric2 = butler.getDirect(metric2_ref)
1418 self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")
1420 # Make a new DatasetRef with the compatible but different DatasetType.
1421 # This should now return a dict.
1422 new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
1423 test_dict2 = butler.getDirect(new_ref)
1424 self.assertEqual(get_full_type_name(test_dict2), "dict")
1426 # Get it again with the wrong dataset type definition using get()
1427 # rather than getDirect(). This should be consistent with getDirect()
1428 # behavior and return the type of the DatasetType.
1429 test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
1430 self.assertEqual(get_full_type_name(test_dict3), "dict")
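# To summarize the coercion behaviour exercised above, in sketch form:
# the python type returned by get()/getDirect() follows the storage class
# of the DatasetType attached to whatever was passed in, converting when
# it differs from the type used at put():
#
#     butler.getDirect(metric_ref)  # registry definition -> MetricsExample
#     butler.getDirect(new_ref)     # dict-based DatasetType -> dict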
1432 def testPytypeCoercion(self):
1433 """Test python type coercion on Butler.get and put."""
1435 # Store some data with the normal example storage class.
1436 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1437 datasetTypeName = "test_metric"
1438 butler = self.runPutGetTest(storageClass, datasetTypeName)
1440 dataId = {"instrument": "DummyCamComp", "visit": 423}
1441 metric = butler.get(datasetTypeName, dataId=dataId)
1442 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1444 datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
1445 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1447 # Now need to hack the registry dataset type definition.
1448 # There is no API for this.
1449 manager = butler.registry._managers.datasets
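# Note: in this low-level update(), the {"name": datasetTypeName} mapping
# gives, for each column to match on, the key in the row dict that carries
# the comparison value; that is why the row below repeats the dataset type
# name as both key and value.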
1450 manager._db.update(
1451 manager._static.dataset_type,
1452 {"name": datasetTypeName},
1453 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1454 )
1456 # Force reset of dataset type cache
1457 butler.registry.refresh()
1459 datasetType_new = butler.registry.getDatasetType(datasetTypeName)
1460 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1461 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1463 metric_model = butler.get(datasetTypeName, dataId=dataId)
1464 self.assertNotEqual(type(metric_model), type(metric))
1465 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1467 # Put the model and read it back to show that everything now
1468 # works as normal.
1469 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1470 metric_model_new = butler.get(metric_ref)
1471 self.assertEqual(metric_model_new, metric_model)
1473 # Hack the storage class again to something that will fail on the
1474 # get with no conversion class.
1475 manager._db.update(
1476 manager._static.dataset_type,
1477 {"name": datasetTypeName},
1478 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1479 )
1480 butler.registry.refresh()
1482 with self.assertRaises(ValueError):
1483 butler.get(datasetTypeName, dataId=dataId)
1486class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1487 """InMemoryDatastore specialization of a butler"""
1489 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1490 fullConfigKey = None
1491 useTempRoot = False
1492 validationCanFail = False
1493 datastoreStr = ["datastore='InMemory"]
1494 datastoreName = ["InMemoryDatastore@"]
1495 registryStr = "/gen3.sqlite3"
1497 def testIngest(self):
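# Overridden as a no-op: file ingest does not apply to an in-memory
# datastore, so the inherited test is disabled.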
1498 pass
1501class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1502 """PosixDatastore specialization"""
1504 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
1505 fullConfigKey = ".datastore.datastores.1.formatters"
1506 validationCanFail = True
1507 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
1508 datastoreName = [
1509 "InMemoryDatastore@",
1510 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
1511 "SecondDatastore",
1512 ]
1513 registryStr = "/gen3.sqlite3"
1516class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
1517 """Test that a yaml file in one location can refer to a root in another."""
1519 datastoreStr = ["dir1"]
1520 # Disable the makeRepo test since we are deliberately not using
1521 # butler.yaml as the config name.
1522 fullConfigKey = None
1524 def setUp(self):
1525 self.root = makeTestTempDir(TESTDIR)
1527 # Make a new repository in one place
1528 self.dir1 = os.path.join(self.root, "dir1")
1529 Butler.makeRepo(self.dir1, config=Config(self.configFile))
1531 # Move the yaml file to a different place and add a "root"
1532 self.dir2 = os.path.join(self.root, "dir2")
1533 os.makedirs(self.dir2, exist_ok=True)
1534 configFile1 = os.path.join(self.dir1, "butler.yaml")
1535 config = Config(configFile1)
1536 config["root"] = self.dir1
1537 configFile2 = os.path.join(self.dir2, "butler2.yaml")
1538 config.dumpToUri(configFile2)
1539 os.remove(configFile1)
1540 self.tmpConfigFile = configFile2
1542 def testFileLocations(self):
1543 self.assertNotEqual(self.dir1, self.dir2)
1544 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
1545 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
1546 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
1549class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
1550 """Test that a config file created by makeRepo outside of repo works."""
1552 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1554 def setUp(self):
1555 self.root = makeTestTempDir(TESTDIR)
1556 self.root2 = makeTestTempDir(TESTDIR)
1558 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
1559 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1561 def tearDown(self):
1562 if os.path.exists(self.root2):
1563 shutil.rmtree(self.root2, ignore_errors=True)
1564 super().tearDown()
1566 def testConfigExistence(self):
1567 c = Config(self.tmpConfigFile)
1568 uri_config = ResourcePath(c["root"])
1569 uri_expected = ResourcePath(self.root, forceDirectory=True)
1570 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
1571 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
1573 def testPutGet(self):
1574 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1575 self.runPutGetTest(storageClass, "test_metric")
1578class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
1579 """Test that a config file created by makeRepo outside of repo works."""
1581 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1583 def setUp(self):
1584 self.root = makeTestTempDir(TESTDIR)
1585 self.root2 = makeTestTempDir(TESTDIR)
1587 self.tmpConfigFile = self.root2
1588 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1590 def testConfigExistence(self):
1591 # Append the yaml file name, otherwise the Config constructor does not
1592 # know the file type.
1593 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
1594 super().testConfigExistence()
1597class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
1598 """Test that a config file created by makeRepo outside of repo works."""
1600 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1602 def setUp(self):
1603 self.root = makeTestTempDir(TESTDIR)
1604 self.root2 = makeTestTempDir(TESTDIR)
1606 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
1607 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1610@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
1611class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1612 """S3Datastore specialization of a butler; an S3 storage Datastore +
1613 a local in-memory SqlRegistry.
1614 """
1616 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
1617 fullConfigKey = None
1618 validationCanFail = True
1620 bucketName = "anybucketname"
1621 """Name of the Bucket that will be used in the tests. The name is read from
1622 the config file used with the tests during set-up.
1623 """
1625 root = "butlerRoot/"
1626 """Root repository directory expected to be used in case useTempRoot=False.
1627 Otherwise the root is set to a 20 characters long randomly generated string
1628 during set-up.
1629 """
1631 datastoreStr = [f"datastore={root}"]
1632 """Contains all expected root locations in a format expected to be
1633 returned by Butler stringification.
1634 """
1636 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
1637 """The expected format of the S3 Datastore string."""
1639 registryStr = "/gen3.sqlite3"
1640 """Expected format of the Registry string."""
1642 mock_s3 = mock_s3()
1643 """The mocked s3 interface from moto."""
1645 def genRoot(self):
1646 """Returns a random string of len 20 to serve as a root
1647 name for the temporary bucket repo.
1649 This is equivalent to tempfile.mkdtemp as this is what self.root
1650 becomes when useTempRoot is True.
1651 """
1652 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1653 return rndstr + "/"
1655 def setUp(self):
1656 config = Config(self.configFile)
1657 uri = ResourcePath(config[".datastore.datastore.root"])
1658 self.bucketName = uri.netloc
1660 # Enable S3 mocking of tests.
1661 self.mock_s3.start()
1663 # set up some fake credentials if they do not exist
1664 self.usingDummyCredentials = setAwsEnvCredentials()
1666 if self.useTempRoot:
1667 self.root = self.genRoot()
1668 rooturi = f"s3://{self.bucketName}/{self.root}"
1669 config.update({"datastore": {"datastore": {"root": rooturi}}})
1671 # Need a local folder to store the registry database.
1672 self.reg_dir = makeTestTempDir(TESTDIR)
1673 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1675 # Moto needs to know that we expect bucket bucketName to exist
1676 # (this used to be the class attribute bucketName).
1677 s3 = boto3.resource("s3")
1678 s3.create_bucket(Bucket=self.bucketName)
1680 self.datastoreStr = f"datastore={self.root}"
1681 self.datastoreName = [f"FileDatastore@{rooturi}"]
1682 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
1683 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
1685 def tearDown(self):
1686 s3 = boto3.resource("s3")
1687 bucket = s3.Bucket(self.bucketName)
1688 try:
1689 bucket.objects.all().delete()
1690 except botocore.exceptions.ClientError as e:
1691 if e.response["Error"]["Code"] == "404":
1692 # the key was not reachable - pass
1693 pass
1694 else:
1695 raise
1697 bucket = s3.Bucket(self.bucketName)
1698 bucket.delete()
1700 # Stop the S3 mock.
1701 self.mock_s3.stop()
1703 # unset any potentially set dummy credentials
1704 if self.usingDummyCredentials:
1705 unsetAwsEnvCredentials()
1707 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1708 shutil.rmtree(self.reg_dir, ignore_errors=True)
1710 if self.useTempRoot and os.path.exists(self.root):
1711 shutil.rmtree(self.root, ignore_errors=True)
1714@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!")
1715class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1716 """WebdavDatastore specialization of a butler; a Webdav storage Datastore +
1717 a local in-memory SqlRegistry.
1718 """
1720 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml")
1721 fullConfigKey = None
1722 validationCanFail = True
1724 serverName = "localhost"
1725 """Name of the server that will be used in the tests.
1726 """
1728 portNumber = 8080
1729 """Port on which the webdav server listens. Automatically chosen
1730 at setUpClass via the _getfreeport() method
1731 """
1733 root = "butlerRoot/"
1734 """Root repository directory expected to be used in case useTempRoot=False.
1735 Otherwise the root is set to a 20 characters long randomly generated string
1736 during set-up.
1737 """
1739 datastoreStr = [f"datastore={root}"]
1740 """Contains all expected root locations in a format expected to be
1741 returned by Butler stringification.
1742 """
1744 datastoreName = ["FileDatastore@https://{serverName}/{root}"]
1745 """The expected format of the WebdavDatastore string."""
1747 registryStr = "/gen3.sqlite3"
1748 """Expected format of the Registry string."""
1750 serverThread = None
1751 """Thread in which the local webdav server will run."""
1753 stopWebdavServer = False
1754 """When set to True, this flag causes the webdav server
1755 to shut down gracefully.
1756 """
1758 def genRoot(self):
1759 """Returns a random string of len 20 to serve as a root
1760 name for the temporary bucket repo.
1762 This is equivalent to tempfile.mkdtemp as this is what self.root
1763 becomes when useTempRoot is True.
1764 """
1765 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1766 return rndstr + "/"
1768 @classmethod
1769 def setUpClass(cls):
1770 # Do the same as inherited class
1771 cls.storageClassFactory = StorageClassFactory()
1772 cls.storageClassFactory.addFromConfig(cls.configFile)
1774 cls.portNumber = cls._getfreeport()
1775 # Run a local webdav server on which tests will be run
1776 cls.serverThread = Thread(
1777 target=cls._serveWebdav, args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), daemon=True
1778 )
1779 cls.serverThread.start()
1780 # Wait for it to start
1781 time.sleep(3)
1783 @classmethod
1784 def tearDownClass(cls):
1785 # Ask for graceful shut down of the webdav server
1786 cls.stopWebdavServer = True
1787 # Wait for the thread to exit
1788 cls.serverThread.join()
1790 def setUp(self):
1791 config = Config(self.configFile)
1793 if self.useTempRoot:
1794 self.root = self.genRoot()
1795 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}"
1796 config.update({"datastore": {"datastore": {"root": self.rooturi}}})
1799 # Need a local folder to store the registry database.
1799 self.reg_dir = makeTestTempDir(TESTDIR)
1800 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1802 self.datastoreStr = f"datastore={self.root}"
1803 self.datastoreName = [f"FileDatastore@{self.rooturi}"]
1805 if not _is_webdav_endpoint(self.rooturi):
1806 raise OSError("Webdav server not running properly: cannot run tests.")
1808 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False)
1809 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml")
1811 def tearDown(self):
1812 # Clear temporary directory
1813 ResourcePath(self.rooturi).remove()
1814 ResourcePath(self.rooturi).session.close()
1816 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1817 shutil.rmtree(self.reg_dir, ignore_errors=True)
1819 if self.useTempRoot and os.path.exists(self.root):
1820 shutil.rmtree(self.root, ignore_errors=True)
1822 def _serveWebdav(self, port: int, stopWebdavServer):
1823 """Starts a local webdav-compatible HTTP server,
1824 Listening on http://localhost:port
1825 This server only runs when this test class is instantiated,
1826 and then shuts down. Must be started is a separate thread.
1828 Parameters
1829 ----------
1830 port : `int`
1831 The port number on which the server should listen
1832 """
1833 root_path = gettempdir()
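# WsgiDAVApp configuration sketch: serve the system temp directory at "/",
# with the domain controller disabled so that all users are anonymous.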
1835 config = {
1836 "host": "0.0.0.0",
1837 "port": port,
1838 "provider_mapping": {"/": root_path},
1839 "http_authenticator": {"domain_controller": None},
1840 "simple_dc": {"user_mapping": {"*": True}},
1841 "verbose": 0,
1842 }
1843 app = WsgiDAVApp(config)
1845 server_args = {
1846 "bind_addr": (config["host"], config["port"]),
1847 "wsgi_app": app,
1848 }
1849 server = wsgi.Server(**server_args)
1850 server.prepare()
1852 try:
1853 # Start the actual server in a separate thread
1854 t = Thread(target=server.serve, daemon=True)
1855 t.start()
1856 # Watch stopWebdavServer, and gracefully
1857 # shut down the server when it returns True.
1858 while True:
1859 if stopWebdavServer():
1860 break
1861 time.sleep(1)
1862 except KeyboardInterrupt:
1863 print("Caught Ctrl-C, shutting down...")
1864 finally:
1865 server.stop()
1866 t.join()
1868 def _getfreeport():
1869 """
1870 Determines a free port using sockets.
1871 """
1872 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
1873 free_socket.bind(("0.0.0.0", 0))
1874 free_socket.listen()
1875 port = free_socket.getsockname()[1]
1876 free_socket.close()
1877 return port
1880class PosixDatastoreTransfers(unittest.TestCase):
1881 """Test data transfers between butlers.
1883 Tests are run for several dataset-ID managers: UUID to UUID and integer
1884 to integer are tested. UUID to integer is not supported since we do not
1885 currently want to allow that. Integer to UUID is supported, with the
1886 caveat that a UUID4 will be generated, which would be incorrect for raw
1887 dataset types; the test ignores that.
1888 """
1890 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1892 @classmethod
1893 def setUpClass(cls):
1894 cls.storageClassFactory = StorageClassFactory()
1895 cls.storageClassFactory.addFromConfig(cls.configFile)
1897 def setUp(self):
1898 self.root = makeTestTempDir(TESTDIR)
1899 self.config = Config(self.configFile)
1901 def tearDown(self):
1902 removeTestTempDir(self.root)
1904 def create_butler(self, manager, label):
1905 config = Config(self.configFile)
1906 config["registry", "managers", "datasets"] = manager
1907 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
1909 def create_butlers(self, manager1, manager2):
1910 self.source_butler = self.create_butler(manager1, "1")
1911 self.target_butler = self.create_butler(manager2, "2")
1913 def testTransferUuidToUuid(self):
1914 self.create_butlers(
1915 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1916 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1917 )
1918 # Setting id_gen_map should have no effect here
1919 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1921 def testTransferIntToInt(self):
1922 self.create_butlers(
1923 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1924 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1925 )
1926 # int dataset ID only allows UNIQUE
1927 self.assertButlerTransfers()
1929 def testTransferIntToUuid(self):
1930 self.create_butlers(
1931 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1932 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1933 )
1934 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1936 def testTransferMissing(self):
1937 """Test transfers where datastore records are missing.
1939 This is how execution butler works.
1940 """
1941 self.create_butlers(
1942 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1943 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1944 )
1946 # Configure the source butler to allow trust.
1947 self.source_butler.datastore.trustGetRequest = True
1949 self.assertButlerTransfers(purge=True)
1951 def testTransferMissingDisassembly(self):
1952 """Test transfers where datastore records are missing.
1954 This is how execution butler works.
1955 """
1956 self.create_butlers(
1957 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1958 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1959 )
1961 # Configure the source butler to allow trust.
1962 self.source_butler.datastore.trustGetRequest = True
1964 # Test disassembly.
1965 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
1967 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"):
1968 """Test that a run can be transferred to another butler."""
1970 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
1971 datasetTypeName = "random_data"
1973 # The test will create three collections, of which we will transfer
1974 # two.
1975 runs = ["run1", "run2", "other"]
1977 # Also want to use two different dataset types to ensure that
1978 # grouping works.
1979 datasetTypeNames = ["random_data", "random_data_2"]
1981 # Create the run collections in the source butler.
1982 for run in runs:
1983 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
1985 # Create dimensions in both butlers (transfer will not create them).
1986 n_exposures = 30
1987 for butler in (self.source_butler, self.target_butler):
1988 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1989 butler.registry.insertDimensionData(
1990 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
1991 )
1992 butler.registry.insertDimensionData(
1993 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
1994 )
1996 for i in range(n_exposures):
1997 butler.registry.insertDimensionData(
1998 "exposure",
1999 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
2000 )
2002 # Create dataset types in the source butler.
2003 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
2004 for datasetTypeName in datasetTypeNames:
2005 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
2006 self.source_butler.registry.registerDatasetType(datasetType)
2008 # Write a dataset to an unrelated run -- this will ensure that
2009 # we are rewriting integer dataset ids in the target if necessary.
2010 # Will not be relevant for UUID.
2011 run = "distraction"
2012 butler = Butler(butler=self.source_butler, run=run)
2013 butler.put(
2014 makeExampleMetrics(),
2015 datasetTypeName,
2016 exposure=1,
2017 instrument="DummyCamComp",
2018 physical_filter="d-r",
2019 )
2021 # Write some example metrics to the source
2022 butler = Butler(butler=self.source_butler)
2024 # Set of DatasetRefs that should be in the list of refs to transfer
2025 # but which will not be transferred.
2026 deleted = set()
2028 n_expected = 20 # Number of datasets expected to be transferred
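# (30 exposures are spread over three collections; the two retained
# collections hold two thirds of them, hence 20.)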
2029 source_refs = []
2030 for i in range(n_exposures):
2031 # Put a third of the datasets into each collection; only retain
2032 # two thirds.
2033 index = i % 3
2034 run = runs[index]
2035 datasetTypeName = datasetTypeNames[i % 2]
2037 metric_data = {
2038 "summary": {"counter": i},
2039 "output": {"text": "metric"},
2040 "data": [2 * x for x in range(i)],
2041 }
2042 metric = MetricsExample(**metric_data)
2043 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
2044 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
2046 # Remove the datastore record using low-level API
2047 if purge:
2048 # Remove records for a fraction.
2049 if index == 1:
2051 # For one of these delete the file as well.
2052 # This allows the "missing" code to filter the
2053 # file out.
2054 if not deleted:
2055 primary, uris = butler.datastore.getURIs(ref)
2056 if primary:
2057 primary.remove()
2058 for uri in uris.values():
2059 uri.remove()
2060 n_expected -= 1
2061 deleted.add(ref)
2063 # Remove the datastore record.
2064 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id})
2066 if index < 2:
2067 source_refs.append(ref)
2068 if ref not in deleted:
2069 new_metric = butler.get(ref.unresolved(), collections=run)
2070 self.assertEqual(new_metric, metric)
2072 # Create some bad dataset types to ensure we check for inconsistent
2073 # definitions.
2074 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
2075 for datasetTypeName in datasetTypeNames:
2076 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
2077 self.target_butler.registry.registerDatasetType(datasetType)
2078 with self.assertRaises(ConflictingDefinitionError):
2079 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2080 # And remove the bad definitions.
2081 for datasetTypeName in datasetTypeNames:
2082 self.target_butler.registry.removeDatasetType(datasetTypeName)
2084 # Transfer without creating dataset types should fail.
2085 with self.assertRaises(KeyError):
2086 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2088 # Now transfer them to the second butler
2089 with self.assertLogs(level=logging.DEBUG) as cm:
2090 transferred = self.target_butler.transfer_from(
2091 self.source_butler, source_refs, id_gen_map=id_gen_map, register_dataset_types=True
2092 )
2093 self.assertEqual(len(transferred), n_expected)
2094 log_output = ";".join(cm.output)
2095 self.assertIn("found in datastore for chunk", log_output)
2096 self.assertIn("Creating output run", log_output)
2098 # Do the transfer twice to ensure that it will do nothing extra.
2099 # Only do this if purge=True because it does not work for int
2100 # dataset_id.
2101 if purge:
2102 # This should not need to register dataset types.
2103 transferred = self.target_butler.transfer_from(
2104 self.source_butler, source_refs, id_gen_map=id_gen_map
2105 )
2106 self.assertEqual(len(transferred), n_expected)
2108 # Also do an explicit low-level transfer to trigger some
2109 # edge cases.
2110 with self.assertLogs(level=logging.DEBUG) as cm:
2111 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
2112 log_output = ";".join(cm.output)
2113 self.assertIn("no file artifacts exist", log_output)
2115 with self.assertRaises(TypeError):
2116 self.target_butler.datastore.transfer_from(self.source_butler, source_refs)
2118 with self.assertRaises(ValueError):
2119 self.target_butler.datastore.transfer_from(
2120 self.source_butler.datastore, source_refs, transfer="split"
2121 )
2123 # Now try to get the same refs from the new butler.
2124 for ref in source_refs:
2125 if ref not in deleted:
2126 unresolved_ref = ref.unresolved()
2127 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run)
2128 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run)
2129 self.assertEqual(new_metric, old_metric)
2131 # Now prune the run2 collection and instead create a CHAINED collection
2132 # with the same name. This should block the transfer.
2133 self.target_butler.pruneCollection("run2", purge=True, unstore=True)
2134 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2135 with self.assertRaises(CollectionTypeError):
2136 # Re-importing the run1 datasets can be problematic if they
2137 # use integer IDs so filter those out.
2138 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2139 self.target_butler.transfer_from(self.source_butler, to_transfer, id_gen_map=id_gen_map)
2142if __name__ == "__main__":
2143 unittest.main()