Coverage for tests/test_butler.py: 16%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler.
"""

import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import socket
import string
import tempfile
import time
import unittest

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto's mock_s3 cannot be imported."""
        return cls
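
# With moto absent, the stand-in decorator above returns the class unchanged,
# so the module still imports cleanly. A hypothetical sketch of how such a
# decorator is applied (the class name here is illustrative only, not part of
# this section):
#
#     @mock_s3
#     class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
#         """Tests exercising an S3-backed datastore via moto."""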

try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None

from tempfile import gettempdir
from threading import Thread

import astropy.time
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    CollectionSearch,
    CollectionType,
    Config,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
)
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.http import isWebdavEndpoint
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImport
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )
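
# A minimal usage sketch of the example metrics, mirroring the put/get
# pattern exercised throughout this module (assumes a configured `butler`
# and a registered "test_metric" dataset type, as set up in the tests below):
#
#     metric = makeExampleMetrics()
#     ref = butler.put(metric, "test_metric", {"instrument": "DummyCamComp", "visit": 423})
#     assert butler.getDirect(ref) == metric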


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent the misdiagnosis
    that might otherwise occur when a standard exception is used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not covered by any other test
    cases."""

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests:
    """Helper methods for running a suite of put/get tests against different
    butler configurations."""

    root = None
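
    # Concrete test cases mixing in this class are expected to provide
    # ``configFile`` (used by setUpClass below) and to set
    # ``self.tmpConfigFile`` during setUp; see PosixDatastoreButlerTestCase
    # near the end of this module.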

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it"""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
        removeTestTempDir(self.root)

    def create_butler(self, run, storageClass, datasetTypeName):
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "visit_system": 1,
                },
            )
        return butler, datasetType

    def runPutGetTest(self, storageClass, datasetTypeName):
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = "ingest"
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

                # Can the artifacts themselves be retrieved?
                if not butler.datastore.isEphemeral:
                    root_uri = ResourcePath(self.root)

                    for preserve_path in (True, False):
                        destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                        # Use copy so that we can test that overwrite
                        # protection works (using "auto" for File URIs would
                        # use hard links and subsequent transfer would work
                        # because it knows they are the same file).
                        transferred = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, transfer="copy"
                        )
                        self.assertGreater(len(transferred), 0)
                        artifacts = list(ResourcePath.findFileResources([destination]))
                        self.assertEqual(set(transferred), set(artifacts))

                        for artifact in transferred:
                            path_in_destination = artifact.relative_to(destination)
                            self.assertIsNotNone(path_in_destination)

                            # When the path is not preserved there should not
                            # be any path separators.
                            num_seps = path_in_destination.count("/")
                            if preserve_path:
                                self.assertGreater(num_seps, 0)
                            else:
                                self.assertEqual(num_seps, 0)

                        primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                        n_uris = len(secondary_uris)
                        if primary_uri:
                            n_uris += 1
                        self.assertEqual(
                            len(artifacts),
                            n_uris,
                            "Comparing expected artifacts vs actual:"
                            f" {artifacts} vs {primary_uri} and {secondary_uris}",
                        )

                        if preserve_path:
                            # No need to run these twice
                            with self.assertRaises(ValueError):
                                butler.retrieveArtifacts([ref], destination, transfer="move")

                            with self.assertRaises(FileExistsError):
                                butler.retrieveArtifacts([ref], destination)

                            transferred_again = butler.retrieveArtifacts(
                                [ref], destination, preserve_path=preserve_path, overwrite=True
                            )
                            self.assertEqual(set(transferred_again), set(transferred))

                # Now remove the dataset completely.
                butler.pruneDatasets([ref], purge=True, unstore=True, run=this_run)
                # Lookup with original args should still fail.
                with self.assertRaises(LookupError):
                    butler.datasetExists(*args, collections=this_run)
                # getDirect() should still fail.
                with self.assertRaises(FileNotFoundError):
                    butler.getDirect(ref)
                # Registry shouldn't be able to find it by dataset_id anymore.
                self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know the collection is
            # empty.
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present
        return butler

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection raises a
        # CollectionError.
        with self.assertRaises(CollectionError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Deleting the dataset from the new collection should still leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run="ingest")
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {"ingest"})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, CollectionSearch.fromExpression(["other"]))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler("not_there", writeable=False)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases"):
            # No aliases registered.
            Butler("not_there")
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testIngest(self):
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile, refs=refs, formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy", record_validation_info=False)

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory cannot ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

    def testPruneCollections(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge and not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [])

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)

    def testPickle(self):
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            (
                "instrument",
                {"instrument": "DummyCam"},
                {"instrument": "DummyHSC"},
                {"instrument": "DummyCamComp"},
            ),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are not
        # created for components, but querying can still return the
        # components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""

        butler = Butler(self.tmpConfigFile, run="ingest")

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check if a file exists at a given path (relative to root).

        Test testPutTemplates verifies the actual physical existence of the
        files in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric1/??#?/d-r/DummyCamComp_423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric2/d-r/DummyCamComp_v423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """This test does an export to a temp directory and an import back
        into a new temp directory repo. It does not assume a POSIX datastore.
        """
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        print("Root:", exportButler.datastore.root)
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements
                # even though there aren't any in these datasets or in the
                # database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public API.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                        reuse_ids=False,
                    )
                importButler = Butler(importDir, run="ingest")
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )

    def testRemoveRuns(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put a dataset in each.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])
        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler"""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self):
        """Independent test of constructor using PathLike."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self):
        """Test local export using several transfer modes."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        datastoreRoot = exportButler.datastore.root

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume local file system
            self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
                    export.saveDatasets(datasets)
                for path in pathsInStore:
                    self.assertTrue(
                        self.checkFileExists(exportDir, path),
                        f"Check that mode {transfer} exported files",
                    )
1321 def testPruneDatasets(self):
1322 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1323 butler = Butler(self.tmpConfigFile, writeable=True)
1324 # Load registry data with dimensions to hang datasets off of.
1325 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1326 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1327 # Add some RUN-type collections.
1328 run1 = "run1"
1329 butler.registry.registerRun(run1)
1330 run2 = "run2"
1331 butler.registry.registerRun(run2)
1332 # put some datasets. ref1 and ref2 have the same data ID, and are in
1333 # different runs. ref3 has a different data ID.
1334 metric = makeExampleMetrics()
1335 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1336 datasetType = self.addDatasetType(
1337 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1338 )
1339 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1340 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1341 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1343 # Simple prune.
1344 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1345 with self.assertRaises(LookupError):
1346 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)
1348 # Put data back.
1349 ref1 = butler.put(metric, ref1.unresolved(), run=run1)
1350 ref2 = butler.put(metric, ref2.unresolved(), run=run2)
1351 ref3 = butler.put(metric, ref3.unresolved(), run=run1)
1353 # Check that in normal mode, deleting the record will lead to
1354 # trash not touching the file.
1355 uri1 = butler.datastore.getURI(ref1)
1356 butler.datastore.bridge.moveToTrash([ref1]) # Update the dataset_location table
1357 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
1358 butler.datastore.trash(ref1)
1359 butler.datastore.emptyTrash()
1360 self.assertTrue(uri1.exists())
1361 uri1.remove() # Clean it up.
1363 # Simulate execution butler setup by deleting the datastore
1364 # record but keeping the file around and trusting.
1365 butler.datastore.trustGetRequest = True
1366 uri2 = butler.datastore.getURI(ref2)
1367 uri3 = butler.datastore.getURI(ref3)
1368 self.assertTrue(uri2.exists())
1369 self.assertTrue(uri3.exists())
1371 # Remove the datastore record.
1372 butler.datastore.bridge.moveToTrash([ref2]) # Update the dataset_location table
1373 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
1374 self.assertTrue(uri2.exists())
1375 butler.datastore.trash([ref2, ref3])
1376 # The ref2 file is removed immediately because its record is gone.
1377 self.assertFalse(uri2.exists())
1378 # But ref3 has to wait for emptyTrash().
1379 self.assertTrue(uri3.exists())
1380 butler.datastore.emptyTrash()
1381 self.assertFalse(uri3.exists())
1383 # Clear out the datasets from registry.
1384 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
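# Other pruning modes, sketched on the assumption that the standard
# pruneDatasets keywords are available (they are not exercised above):
# unstore-only removal deletes the stored artifacts while keeping the
# registry entries, and tag-based disassociation detaches refs from a
# TAGGED collection without deleting anything.
def _pruneSketch(self, butler, refs):
    # Delete artifacts from the datastore but keep registry records.
    butler.pruneDatasets(refs, disassociate=False, unstore=True)
    # Detach refs from a hypothetical TAGGED collection named "my-tag".
    butler.pruneDatasets(refs, tags=["my-tag"], unstore=False)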
1386 def testPytypePutCoercion(self):
1387 """Test python type coercion on Butler.get and put."""
1389 # Store some data with the normal example storage class.
1390 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1391 datasetTypeName = "test_metric"
1392 butler, _ = self.create_butler("ingest", storageClass, datasetTypeName)
1394 dataId = {"instrument": "DummyCamComp", "visit": 423}
1396 # Put a dict; it should be coerced to a MetricsExample.
1397 test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
1398 metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
1399 test_metric = butler.getDirect(metric_ref)
1400 self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
1401 self.assertEqual(test_metric.summary, test_dict["summary"])
1402 self.assertEqual(test_metric.output, test_dict["output"])
1404 # Check that the put still works if a DatasetType is given with
1405 # a definition matching this python type.
1406 registry_type = butler.registry.getDatasetType(datasetTypeName)
1407 this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
1408 metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
1409 self.assertEqual(metric2_ref.datasetType, registry_type)
1411 # The get will return the type expected by registry.
1412 test_metric2 = butler.getDirect(metric2_ref)
1413 self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")
1415 # Make a new DatasetRef with the compatible but different DatasetType.
1416 # This should now return a dict.
1417 new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
1418 test_dict2 = butler.getDirect(new_ref)
1419 self.assertEqual(get_full_type_name(test_dict2), "dict")
1421 # Get it again with the wrong dataset type definition using get()
1422 # rather than getDirect(). This should be consistent with getDirect()
1423 # behavior and return the type of the DatasetType.
1424 test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
1425 self.assertEqual(get_full_type_name(test_dict3), "dict")
1427 def testPytypeCoercion(self):
1428 """Test python type coercion on Butler.get and put."""
1430 # Store some data with the normal example storage class.
1431 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1432 datasetTypeName = "test_metric"
1433 butler = self.runPutGetTest(storageClass, datasetTypeName)
1435 dataId = {"instrument": "DummyCamComp", "visit": 423}
1436 metric = butler.get(datasetTypeName, dataId=dataId)
1437 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1439 datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
1440 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1442 # Now need to hack the registry dataset type definition.
1443 # There is no API for this.
1444 manager = butler.registry._managers.datasets
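# In Database.update the second argument maps column names to keys in
# the row dicts, so {"name": datasetTypeName} below means "match the
# name column against the row value stored under the datasetTypeName
# key"; the row dict then supplies that key plus the new storage_class.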
1445 manager._db.update(
1446 manager._static.dataset_type,
1447 {"name": datasetTypeName},
1448 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1449 )
1451 # Force reset of dataset type cache
1452 butler.registry.refresh()
1454 datasetType_new = butler.registry.getDatasetType(datasetTypeName)
1455 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1456 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1458 metric_model = butler.get(datasetTypeName, dataId=dataId)
1459 self.assertNotEqual(type(metric_model), type(metric))
1460 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1462 # Put the model and read it back to show that everything now
1463 # works as normal.
1464 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1465 metric_model_new = butler.get(metric_ref)
1466 self.assertEqual(metric_model_new, metric_model)
1468 # Hack the storage class again to something that will fail on the
1469 # get with no conversion class.
1470 manager._db.update(
1471 manager._static.dataset_type,
1472 {"name": datasetTypeName},
1473 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1474 )
1475 butler.registry.refresh()
1477 with self.assertRaises(ValueError):
1478 butler.get(datasetTypeName, dataId=dataId)
1481class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1482 """InMemoryDatastore specialization of a butler"""
1484 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1485 fullConfigKey = None
1486 useTempRoot = False
1487 validationCanFail = False
1488 datastoreStr = ["datastore='InMemory"]
1489 datastoreName = ["InMemoryDatastore@"]
1490 registryStr = "/gen3.sqlite3"
1492 def testIngest(self):
1493 """Skipped: an in-memory datastore cannot ingest file artifacts."""
1496class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1497 """PosixDatastore specialization"""
1499 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
1500 fullConfigKey = ".datastore.datastores.1.formatters"
1501 validationCanFail = True
1502 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
1503 datastoreName = [
1504 "InMemoryDatastore@",
1505 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
1506 "SecondDatastore",
1507 ]
1508 registryStr = "/gen3.sqlite3"
1511class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
1512 """Test that a yaml file in one location can refer to a root in another."""
1514 datastoreStr = ["dir1"]
1515 # Disable the makeRepo test since we are deliberately not using
1516 # butler.yaml as the config name.
1517 fullConfigKey = None
1519 def setUp(self):
1520 self.root = makeTestTempDir(TESTDIR)
1522 # Make a new repository in one place
1523 self.dir1 = os.path.join(self.root, "dir1")
1524 Butler.makeRepo(self.dir1, config=Config(self.configFile))
1526 # Move the yaml file to a different place and add a "root"
1527 self.dir2 = os.path.join(self.root, "dir2")
1528 os.makedirs(self.dir2, exist_ok=True)
1529 configFile1 = os.path.join(self.dir1, "butler.yaml")
1530 config = Config(configFile1)
1531 config["root"] = self.dir1
1532 configFile2 = os.path.join(self.dir2, "butler2.yaml")
1533 config.dumpToUri(configFile2)
1534 os.remove(configFile1)
1535 self.tmpConfigFile = configFile2
1537 def testFileLocations(self):
1538 self.assertNotEqual(self.dir1, self.dir2)
1539 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
1540 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
1541 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
1544class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
1545 """Test that a config file created by makeRepo outside of repo works."""
1547 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1549 def setUp(self):
1550 self.root = makeTestTempDir(TESTDIR)
1551 self.root2 = makeTestTempDir(TESTDIR)
1553 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
1554 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1556 def tearDown(self):
1557 if os.path.exists(self.root2):
1558 shutil.rmtree(self.root2, ignore_errors=True)
1559 super().tearDown()
1561 def testConfigExistence(self):
1562 c = Config(self.tmpConfigFile)
1563 uri_config = ResourcePath(c["root"])
1564 uri_expected = ResourcePath(self.root, forceDirectory=True)
1565 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
1566 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
1568 def testPutGet(self):
1569 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1570 self.runPutGetTest(storageClass, "test_metric")
1573class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
1574 """Test that a config file created by makeRepo outside of repo works."""
1576 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1578 def setUp(self):
1579 self.root = makeTestTempDir(TESTDIR)
1580 self.root2 = makeTestTempDir(TESTDIR)
1582 self.tmpConfigFile = self.root2
1583 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1585 def testConfigExistence(self):
1586 # Append the yaml file name, else the Config constructor does not
1587 # know the file type.
1588 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
1589 super().testConfigExistence()
1592class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
1593 """Test that a config file created by makeRepo outside of repo works."""
1595 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1597 def setUp(self):
1598 self.root = makeTestTempDir(TESTDIR)
1599 self.root2 = makeTestTempDir(TESTDIR)
1601 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
1602 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1605@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
1606class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1607 """S3Datastore specialization of a butler; an S3 storage Datastore +
1608 a local in-memory SqlRegistry.
1609 """
1611 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
1612 fullConfigKey = None
1613 validationCanFail = True
1615 bucketName = "anybucketname"
1616 """Name of the Bucket that will be used in the tests. The name is read from
1617 the config file used with the tests during set-up.
1618 """
1620 root = "butlerRoot/"
1621 """Root repository directory expected to be used in case useTempRoot=False.
1622 Otherwise the root is set to a 20 characters long randomly generated string
1623 during set-up.
1624 """
1626 datastoreStr = [f"datastore={root}"]
1627 """Contains all expected root locations in a format expected to be
1628 returned by Butler stringification.
1629 """
1631 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
1632 """The expected format of the S3 Datastore string."""
1634 registryStr = "/gen3.sqlite3"
1635 """Expected format of the Registry string."""
1637 mock_s3 = mock_s3()
1638 """The mocked s3 interface from moto."""
1640 def genRoot(self):
1641 """Returns a random string of len 20 to serve as a root
1642 name for the temporary bucket repo.
1644 This is equivalent to tempfile.mkdtemp as this is what self.root
1645 becomes when useTempRoot is True.
1646 """
1647 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1648 return rndstr + "/"
1650 def setUp(self):
1651 config = Config(self.configFile)
1652 uri = ResourcePath(config[".datastore.datastore.root"])
1653 self.bucketName = uri.netloc
1655 # Enable S3 mocking of tests.
1656 self.mock_s3.start()
1658 # Set up some fake credentials if they do not exist.
1659 self.usingDummyCredentials = setAwsEnvCredentials()
1661 if self.useTempRoot:
1662 self.root = self.genRoot()
1663 rooturi = f"s3://{self.bucketName}/{self.root}"
1664 config.update({"datastore": {"datastore": {"root": rooturi}}})
1666 # Need a local folder to store the registry database.
1667 self.reg_dir = makeTestTempDir(TESTDIR)
1668 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1670 # Moto needs to know that we expect the bucket to exist
1671 # (the name used to be the class attribute bucketName).
1672 s3 = boto3.resource("s3")
1673 s3.create_bucket(Bucket=self.bucketName)
1675 self.datastoreStr = f"datastore={self.root}"
1676 self.datastoreName = [f"FileDatastore@{rooturi}"]
1677 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
1678 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
1680 def tearDown(self):
1681 s3 = boto3.resource("s3")
1682 bucket = s3.Bucket(self.bucketName)
1683 try:
1684 bucket.objects.all().delete()
1685 except botocore.exceptions.ClientError as e:
1686 if e.response["Error"]["Code"] == "404":
1687 # The key was not reachable; pass.
1688 pass
1689 else:
1690 raise
1692 bucket = s3.Bucket(self.bucketName)
1693 bucket.delete()
1695 # Stop the S3 mock.
1696 self.mock_s3.stop()
1698 # Unset any dummy credentials that may have been set.
1699 if self.usingDummyCredentials:
1700 unsetAwsEnvCredentials()
1702 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1703 shutil.rmtree(self.reg_dir, ignore_errors=True)
1705 if self.useTempRoot and os.path.exists(self.root):
1706 shutil.rmtree(self.root, ignore_errors=True)
1709@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!")
1710# Mock required environment variables during tests
1711@unittest.mock.patch.dict(
1712 os.environ,
1713 {
1714 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
1715 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
1716 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
1717 },
1718)
1719class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1720 """WebdavDatastore specialization of a butler; a Webdav storage Datastore +
1721 a local in-memory SqlRegistry.
1722 """
1724 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml")
1725 fullConfigKey = None
1726 validationCanFail = True
1728 serverName = "localhost"
1729 """Name of the server that will be used in the tests.
1730 """
1732 portNumber = 8080
1733 """Port on which the webdav server listens. Automatically chosen
1734 in setUpClass via the _getfreeport() method.
1735 """
1737 root = "butlerRoot/"
1738 """Root repository directory expected to be used in case useTempRoot=False.
1739 Otherwise the root is set to a 20 characters long randomly generated string
1740 during set-up.
1741 """
1743 datastoreStr = [f"datastore={root}"]
1744 """Contains all expected root locations in a format expected to be
1745 returned by Butler stringification.
1746 """
1748 datastoreName = ["FileDatastore@https://{serverName}/{root}"]
1749 """The expected format of the WebdavDatastore string."""
1751 registryStr = "/gen3.sqlite3"
1752 """Expected format of the Registry string."""
1754 serverThread = None
1755 """Thread in which the local webdav server will run"""
1757 stopWebdavServer = False
1758 """This flag will cause the webdav server to
1759 gracefully shut down when True
1760 """
1762 def genRoot(self):
1763 """Returns a random string of len 20 to serve as a root
1764 name for the temporary bucket repo.
1766 This is equivalent to tempfile.mkdtemp as this is what self.root
1767 becomes when useTempRoot is True.
1768 """
1769 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1770 return rndstr + "/"
1772 @classmethod
1773 def setUpClass(cls):
1774 # Do the same as the inherited class.
1775 cls.storageClassFactory = StorageClassFactory()
1776 cls.storageClassFactory.addFromConfig(cls.configFile)
1778 cls.portNumber = cls._getfreeport()
1779 # Run a local webdav server on which tests will be run
1780 cls.serverThread = Thread(
1781 target=cls._serveWebdav, args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), daemon=True
1782 )
1783 cls.serverThread.start()
1784 # Wait for it to start
1785 time.sleep(3)
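# A fixed sleep is a race-prone readiness check; a more robust
# alternative (a sketch, not what this suite does) would poll the
# endpoint until it answers or a timeout expires:
#
#     for _ in range(30):
#         if isWebdavEndpoint(f"http://localhost:{cls.portNumber}/"):
#             break
#         time.sleep(0.1)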
1787 @classmethod
1788 def tearDownClass(cls):
1789 # Ask for graceful shut down of the webdav server
1790 cls.stopWebdavServer = True
1791 # Wait for the thread to exit
1792 cls.serverThread.join()
1794 # Mock required environment variables during tests
1795 @unittest.mock.patch.dict(
1796 os.environ,
1797 {
1798 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
1799 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
1800 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
1801 },
1802 )
1803 def setUp(self):
1804 config = Config(self.configFile)
1806 if self.useTempRoot:
1807 self.root = self.genRoot()
1808 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}"
1809 config.update({"datastore": {"datastore": {"root": self.rooturi}}})
1811 # Need a local folder to store the registry database.
1812 self.reg_dir = makeTestTempDir(TESTDIR)
1813 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1815 self.datastoreStr = f"datastore={self.root}"
1816 self.datastoreName = [f"FileDatastore@{self.rooturi}"]
1818 if not isWebdavEndpoint(self.rooturi):
1819 raise OSError("Webdav server not running properly: cannot run tests.")
1821 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False)
1822 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml")
1824 # Mock required environment variables during tests
1825 @unittest.mock.patch.dict(
1826 os.environ,
1827 {
1828 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
1829 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
1830 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
1831 },
1832 )
1833 def tearDown(self):
1834 # Clear temporary directory
1835 ResourcePath(self.rooturi).remove()
1836 ResourcePath(self.rooturi).session.close()
1838 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1839 shutil.rmtree(self.reg_dir, ignore_errors=True)
1841 if self.useTempRoot and os.path.exists(self.root):
1842 shutil.rmtree(self.root, ignore_errors=True)
1844 def _serveWebdav(self, port: int, stopWebdavServer):
1845 """Starts a local webdav-compatible HTTP server,
1846 Listening on http://localhost:port
1847 This server only runs when this test class is instantiated,
1848 and then shuts down. Must be started is a separate thread.
1850 Parameters
1851 ----------
1852 port : `int`
1853 The port number on which the server should listen
1854 """
1855 root_path = gettempdir()
1857 config = {
1858 "host": "0.0.0.0",
1859 "port": port,
1860 "provider_mapping": {"/": root_path},
1861 "http_authenticator": {"domain_controller": None},
1862 "simple_dc": {"user_mapping": {"*": True}},
1863 "verbose": 0,
1864 }
1865 app = WsgiDAVApp(config)
1867 server_args = {
1868 "bind_addr": (config["host"], config["port"]),
1869 "wsgi_app": app,
1870 }
1871 server = wsgi.Server(**server_args)
1872 server.prepare()
1874 try:
1875 # Start the actual server in a separate thread
1876 t = Thread(target=server.serve, daemon=True)
1877 t.start()
1878 # Watch stopWebdavServer, and gracefully
1879 # shut down the server when it returns True.
1880 while True:
1881 if stopWebdavServer():
1882 break
1883 time.sleep(1)
1884 except KeyboardInterrupt:
1885 print("Caught Ctrl-C, shutting down...")
1886 finally:
1887 server.stop()
1888 t.join()
1890 def _getfreeport():
1891 """
1892 Determine a free port using sockets.
1893 """
1894 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
1895 free_socket.bind(("0.0.0.0", 0))
1896 free_socket.listen()
1897 port = free_socket.getsockname()[1]
1898 free_socket.close()
1899 return port
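# Note: the port is released before the server binds it, so another
# process could in principle grab it in between; for these tests a
# rare collision simply fails the endpoint check in setUp.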
1902class PosixDatastoreTransfers(unittest.TestCase):
1903 """Test data transfers between butlers.
1905 Tested for different dataset ID managers: UUID to UUID and integer
1906 to integer. UUID to integer is not supported since we do not
1907 currently want to allow that. Integer to UUID is supported, with the
1908 caveat that a UUID4 will be generated, which would be incorrect for
1909 raw dataset types. The tests ignore that.
1910 """
1912 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1914 @classmethod
1915 def setUpClass(cls):
1916 cls.storageClassFactory = StorageClassFactory()
1917 cls.storageClassFactory.addFromConfig(cls.configFile)
1919 def setUp(self):
1920 self.root = makeTestTempDir(TESTDIR)
1921 self.config = Config(self.configFile)
1923 def tearDown(self):
1924 removeTestTempDir(self.root)
1926 def create_butler(self, manager, label):
1927 config = Config(self.configFile)
1928 config["registry", "managers", "datasets"] = manager
1929 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
1931 def create_butlers(self, manager1, manager2):
1932 self.source_butler = self.create_butler(manager1, "1")
1933 self.target_butler = self.create_butler(manager2, "2")
1935 def testTransferUuidToUuid(self):
1936 self.create_butlers(
1937 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1938 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1939 )
1940 # Setting id_gen_map should have no effect here
1941 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1943 def testTransferIntToInt(self):
1944 self.create_butlers(
1945 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1946 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1947 )
1948 # Integer dataset IDs only allow UNIQUE ID generation.
1949 self.assertButlerTransfers()
1951 def testTransferIntToUuid(self):
1952 self.create_butlers(
1953 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1954 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1955 )
1956 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1958 def testTransferMissing(self):
1959 """Test transfers where datastore records are missing.
1961 This is how execution butler works.
1962 """
1963 self.create_butlers(
1964 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1965 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1966 )
1968 # Configure the source butler to allow trust.
1969 self.source_butler.datastore.trustGetRequest = True
1971 self.assertButlerTransfers(purge=True)
1973 def testTransferMissingDisassembly(self):
1974 """Test transfers where datastore records are missing.
1976 This is how execution butler works.
1977 """
1978 self.create_butlers(
1979 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1980 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1981 )
1983 # Configure the source butler to allow trust.
1984 self.source_butler.datastore.trustGetRequest = True
1986 # Test disassembly.
1987 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
1989 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"):
1990 """Test that a run can be transferred to another butler."""
1992 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
1993 datasetTypeName = "random_data"
1995 # The test will create 3 collections, and we will want to transfer
1996 # two of those three.
1997 runs = ["run1", "run2", "other"]
1999 # We also want to use two different dataset types to ensure that
2000 # grouping works.
2001 datasetTypeNames = ["random_data", "random_data_2"]
2003 # Create the run collections in the source butler.
2004 for run in runs:
2005 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
2007 # Create dimensions in both butlers (transfer will not create them).
2008 n_exposures = 30
2009 for butler in (self.source_butler, self.target_butler):
2010 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
2011 butler.registry.insertDimensionData(
2012 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
2013 )
2014 butler.registry.insertDimensionData(
2015 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
2016 )
2018 for i in range(n_exposures):
2019 butler.registry.insertDimensionData(
2020 "exposure",
2021 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
2022 )
2024 # Create dataset types in the source butler.
2025 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
2026 for datasetTypeName in datasetTypeNames:
2027 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
2028 self.source_butler.registry.registerDatasetType(datasetType)
2030 # Write a dataset to an unrelated run -- this will ensure that
2031 # we are rewriting integer dataset IDs in the target if necessary.
2032 # This will not be relevant for UUIDs.
2033 run = "distraction"
2034 butler = Butler(butler=self.source_butler, run=run)
2035 butler.put(
2036 makeExampleMetrics(),
2037 datasetTypeName,
2038 exposure=1,
2039 instrument="DummyCamComp",
2040 physical_filter="d-r",
2041 )
2043 # Write some example metrics to the source
2044 butler = Butler(butler=self.source_butler)
2046 # Set of DatasetRefs that should be in the list of refs to transfer
2047 # but which will not be transferred.
2048 deleted = set()
2050 n_expected = 20 # Number of datasets expected to be transferred
2051 source_refs = []
2052 for i in range(n_exposures):
2053 # Put a third of the datasets into each collection; only retain
2054 # two thirds.
2055 index = i % 3
2056 run = runs[index]
2057 datasetTypeName = datasetTypeNames[i % 2]
2059 metric_data = {
2060 "summary": {"counter": i},
2061 "output": {"text": "metric"},
2062 "data": [2 * x for x in range(i)],
2063 }
2064 metric = MetricsExample(**metric_data)
2065 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
2066 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
2068 # Remove the datastore record using the low-level API.
2069 if purge:
2070 # Remove records for a fraction of the datasets.
2071 if index == 1:
2073 # For one of these delete the file as well.
2074 # This allows the "missing" code to filter the
2075 # file out.
2076 if not deleted:
2077 primary, uris = butler.datastore.getURIs(ref)
2078 if primary:
2079 primary.remove()
2080 for uri in uris.values():
2081 uri.remove()
2082 n_expected -= 1
2083 deleted.add(ref)
2085 # Remove the datastore record.
2086 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id})
2088 if index < 2:
2089 source_refs.append(ref)
2090 if ref not in deleted:
2091 new_metric = butler.get(ref.unresolved(), collections=run)
2092 self.assertEqual(new_metric, metric)
2094 # Create some bad dataset types to ensure we check for inconsistent
2095 # definitions.
2096 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
2097 for datasetTypeName in datasetTypeNames:
2098 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
2099 self.target_butler.registry.registerDatasetType(datasetType)
2100 with self.assertRaises(ConflictingDefinitionError):
2101 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2102 # And remove the bad definitions.
2103 for datasetTypeName in datasetTypeNames:
2104 self.target_butler.registry.removeDatasetType(datasetTypeName)
2106 # Transferring without creating dataset types first should fail.
2107 with self.assertRaises(KeyError):
2108 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2110 # Now transfer them to the second butler.
2111 with self.assertLogs(level=logging.DEBUG) as cm:
2112 transferred = self.target_butler.transfer_from(
2113 self.source_butler, source_refs, id_gen_map=id_gen_map, register_dataset_types=True
2114 )
2115 self.assertEqual(len(transferred), n_expected)
2116 log_output = ";".join(cm.output)
2117 self.assertIn("found in datastore for chunk", log_output)
2118 self.assertIn("Creating output run", log_output)
2120 # Do the transfer twice to ensure that it will do nothing extra.
2121 # Only do this if purge=True, because it does not work for integer
2122 # dataset IDs.
2123 if purge:
2124 # This should not need to register dataset types.
2125 transferred = self.target_butler.transfer_from(
2126 self.source_butler, source_refs, id_gen_map=id_gen_map
2127 )
2128 self.assertEqual(len(transferred), n_expected)
2130 # Also do an explicit low-level transfer to trigger some
2131 # edge cases.
2132 with self.assertLogs(level=logging.DEBUG) as cm:
2133 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
2134 log_output = ";".join(cm.output)
2135 self.assertIn("no file artifacts exist", log_output)
2137 with self.assertRaises(TypeError):
2138 self.target_butler.datastore.transfer_from(self.source_butler, source_refs)
2140 with self.assertRaises(ValueError):
2141 self.target_butler.datastore.transfer_from(
2142 self.source_butler.datastore, source_refs, transfer="split"
2143 )
2145 # Now try to get the same refs from the new butler.
2146 for ref in source_refs:
2147 if ref not in deleted:
2148 unresolved_ref = ref.unresolved()
2149 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run)
2150 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run)
2151 self.assertEqual(new_metric, old_metric)
2153 # Now prune the run2 collection and instead create a CHAINED collection.
2154 # This should block the transfer.
2155 self.target_butler.pruneCollection("run2", purge=True, unstore=True)
2156 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2157 with self.assertRaises(CollectionTypeError):
2158 # Re-importing the run1 datasets can be problematic if they
2159 # use integer IDs so filter those out.
2160 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2161 self.target_butler.transfer_from(self.source_butler, to_transfer, id_gen_map=id_gen_map)
2164if __name__ == "__main__": 2164 ↛ 2165line 2164 didn't jump to line 2165, because the condition on line 2164 was never true
2165 unittest.main()