Coverage for tests/test_butler.py: 16%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler.
"""

import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import socket
import string
import tempfile
import time
import unittest
import unittest.mock  # Explicit import: unittest does not guarantee the mock submodule.

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto mock_s3 cannot be imported."""
        return cls
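
# When moto is unavailable, the fallback above keeps module import working:
# ``@mock_s3`` then simply returns the decorated class unchanged, and the
# S3-backed test cases are presumably skipped separately (e.g. via a
# ``unittest.skipIf(not boto3, ...)`` guard -- an assumption about the wider
# module, since those test cases fall outside this excerpt).
#
# Hypothetical usage sketch (class name is illustrative):
#
#     @mock_s3
#     class S3ButlerTestCase(unittest.TestCase):
#         ...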

try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None

from tempfile import gettempdir
from threading import Thread

import astropy.time
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    CollectionSearch,
    CollectionType,
    Config,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError, MissingCollectionError
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.http import isWebdavEndpoint
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImport
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )
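
# Judging by how the component tests below consume this object, the three
# positional arguments appear to populate ``summary``, ``output``, and
# ``data`` respectively (the "slice" parameter used later applies to the
# third, list-valued argument); this mapping is inferred from the tests,
# not from MetricsExample's documentation.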


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent misdiagnosing
    that might otherwise occur when a standard exception is used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not tested in any other test
    cases."""

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")
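
# The override mechanism tested above: ButlerConfig consults each directory
# in ``searchPaths`` for config fragments before falling back to the
# packaged defaults, so the ``testConfigs`` directory evidently supplies a
# datastore-records override that renames the table to "override_record".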


class ButlerPutGetTests:
    """Helper methods for running a suite of put/get tests from different
    butler configurations."""

    root = None

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)
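
    # assertGetComponents() above relies on component dataset types being
    # named "<parent>.<component>" via componentTypeName(); e.g. the
    # "summary" component of "test_metric" is fetched as
    # "test_metric.summary" through an ordinary butler.get() call.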

    def tearDown(self):
        removeTestTempDir(self.root)

    def runPutGetTest(self, storageClass, datasetTypeName):
        # Datasets will be put into a run collection; the individual
        # subtests below each use their own distinct run collection for
        # lookup.
        run = "ingest"
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add a second visit for some later tests
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 424,
                "name": "fourtwentyfour",
                "physical_filter": "d-r",
                "visit_system": 1,
            },
        )

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

                # Can the artifacts themselves be retrieved?
                if not butler.datastore.isEphemeral:
                    root_uri = ResourcePath(self.root)

                    for preserve_path in (True, False):
                        destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                        # Use copy so that we can test that overwrite
                        # protection works (using "auto" for File URIs would
                        # use hard links and subsequent transfer would work
                        # because it knows they are the same file).
                        transferred = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, transfer="copy"
                        )
                        self.assertGreater(len(transferred), 0)
                        artifacts = list(ResourcePath.findFileResources([destination]))
                        self.assertEqual(set(transferred), set(artifacts))

                        for artifact in transferred:
                            path_in_destination = artifact.relative_to(destination)
                            self.assertIsNotNone(path_in_destination)

                            # When path is not preserved there should not be
                            # any path separators.
                            num_seps = path_in_destination.count("/")
                            if preserve_path:
                                self.assertGreater(num_seps, 0)
                            else:
                                self.assertEqual(num_seps, 0)

                        primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                        n_uris = len(secondary_uris)
                        if primary_uri:
                            n_uris += 1
                        self.assertEqual(
                            len(artifacts),
                            n_uris,
                            "Comparing expected artifacts vs actual:"
                            f" {artifacts} vs {primary_uri} and {secondary_uris}",
                        )

                        if preserve_path:
                            # No need to run these twice
                            with self.assertRaises(ValueError):
                                butler.retrieveArtifacts([ref], destination, transfer="move")

                            with self.assertRaises(FileExistsError):
                                butler.retrieveArtifacts([ref], destination)

                            transferred_again = butler.retrieveArtifacts(
                                [ref], destination, preserve_path=preserve_path, overwrite=True
                            )
                            self.assertEqual(set(transferred_again), set(transferred))

                # Now remove the dataset completely.
                butler.pruneDatasets([ref], purge=True, unstore=True, run=this_run)
                # Lookup with original args should still fail.
                with self.assertRaises(LookupError):
                    butler.datasetExists(*args, collections=this_run)
                # getDirect() should still fail.
                with self.assertRaises(FileNotFoundError):
                    butler.getDirect(ref)
                # Registry shouldn't be able to find it by dataset_id anymore.
                self.assertIsNone(butler.registry.getDataset(ref.id))

                # Do explicit registry removal since we know they are
                # empty.
                butler.registry.removeCollection(this_run)
                expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present.
        return butler

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with TypeError.
        with self.assertRaises(TypeError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is a TypeError.
        with self.assertRaises(TypeError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(TypeError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Removing the dataset from the new collection should not affect
        # its membership in the original run collection, where it should
        # remain findable.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
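
# The deferred-collection pattern above: a Butler constructed with
# writeable=True but no ``run`` or ``collections`` defers the choice to each
# call, so every get()/datasetExists() must pass ``collections=[...]`` and
# every put() must pass ``run=...``; omitting them raises TypeError rather
# than falling back to a default.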


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run="ingest")
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {"ingest"})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, CollectionSearch.fromExpression(["other"]))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        self.assertEqual(Butler.get_known_repos(), set())
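
    # For reference, the repository index pointed at by
    # DAF_BUTLER_REPOSITORY_INDEX above is a small YAML or JSON mapping of
    # labels to butler config URIs, e.g. (values illustrative):
    #
    #     label: /some/repo/butler.yaml
    #     bad_label: s3://bucket/not_real.yaml
    #
    # Butler.get_repo_uri() resolves one label through that mapping and
    # Butler.get_known_repos() returns all of the labels.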

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testIngest(self):
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile, refs=refs, formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore, since in-memory cannot ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)
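
    # The single-file ingest above works because FileDataset accepts several
    # refs for one path; MultiDetectorFormatter is then responsible for
    # pulling each detector's payload out of the shared YAML file on read,
    # which is why both data IDs resolve to the same URI afterwards.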

    def testPruneCollections(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge and not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [])

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)
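
    # Summary of the pruneCollection() behaviour exercised above:
    #   * RUN collections can only be deleted with purge=True and
    #     unstore=True together;
    #   * TAGGED and CHAINED collections reject purge=True;
    #   * unstore=True removes datastore artifacts reachable through the
    #     pruned collection, while registry entries survive until their
    #     owning RUN collection is itself purged.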

    def testPickle(self):
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            (
                "instrument",
                {"instrument": "DummyCam"},
                {"instrument": "DummyHSC"},
                {"instrument": "DummyCamComp"},
            ),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are not
        # created for its components, but querying for them can still return
        # the components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(LookupError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""
        butler = Butler(self.tmpConfigFile, run="ingest")

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check if a file exists at a given path (relative to root).

        Test testPutTemplates verifies the actual physical existence of the
        files in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric1/??#?/d-r/DummyCamComp_423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric2/d-r/DummyCamComp_v423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """This test does an export to a temp directory and an import back
        into a new temp directory repo. It does not assume a posix
        datastore."""
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        print("Root:", exportButler.datastore.root)
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements
                # even though there aren't any in these datasets or in the
                # database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public api.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                        reuse_ids=False,
                    )
                importButler = Butler(importDir, run="ingest")
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )

    def testRemoveRuns(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # put a dataset in each
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])
        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())
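
# Note the asymmetry demonstrated above: removeRuns() always removes the
# registry entries and datastore records, but with unstore=False the
# artifact itself is merely "forgotten" and left on disk at its original
# URI, whereas unstore=True deletes it.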


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler"""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self):
        """Independent test of constructor using PathLike."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self):
        """Test local export using several transfer modes."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        datastoreRoot = exportButler.datastore.root

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume local file system
            self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
                    export.saveDatasets(datasets)
                for path in pathsInStore:
                    self.assertTrue(
                        self.checkFileExists(exportDir, path),
                        f"Check that mode {transfer} exported files",
                    )

    def testPruneDatasets(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Simple prune.
        butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
        with self.assertRaises(LookupError):
            butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)

        # Put data back.
        ref1 = butler.put(metric, ref1.unresolved(), run=run1)
        ref2 = butler.put(metric, ref2.unresolved(), run=run2)
        ref3 = butler.put(metric, ref3.unresolved(), run=run1)

        # Check that in normal mode, deleting the record will lead to
        # trash not touching the file.
        uri1 = butler.datastore.getURI(ref1)
        butler.datastore.bridge.moveToTrash([ref1])  # Update the dataset_location table
        butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
        butler.datastore.trash(ref1)
        butler.datastore.emptyTrash()
        self.assertTrue(uri1.exists())
        uri1.remove()  # Clean it up.

        # Simulate execution butler setup by deleting the datastore
        # record but keeping the file around and trusting.
        butler.datastore.trustGetRequest = True
        uri2 = butler.datastore.getURI(ref2)
        uri3 = butler.datastore.getURI(ref3)
        self.assertTrue(uri2.exists())
        self.assertTrue(uri3.exists())

        # Remove the datastore record.
        butler.datastore.bridge.moveToTrash([ref2])  # Update the dataset_location table
        butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
        self.assertTrue(uri2.exists())
        butler.datastore.trash([ref2, ref3])
        # Immediate removal for ref2 file
        self.assertFalse(uri2.exists())
        # But ref3 has to wait for the empty.
        self.assertTrue(uri3.exists())
        butler.datastore.emptyTrash()
        self.assertFalse(uri3.exists())

        # Clear out the datasets from registry.
        butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1354 def testPytypeCoercion(self):
1355 """Test python type coercion on Butler.get"""
1357 # Store some data with the normal example storage class.
1358 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1359 datasetTypeName = "test_metric"
1360 butler = self.runPutGetTest(storageClass, datasetTypeName)
1362 dataId = {"instrument": "DummyCamComp", "visit": 423}
1363 metric = butler.get(datasetTypeName, dataId=dataId)
1364 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1366 datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
1367 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1369 # Now need to hack the registry dataset type definition.
1370 # There is no API for this.
1371 manager = butler.registry._managers.datasets
1372 manager._db.update(
1373 manager._static.dataset_type,
1374 {"name": datasetTypeName},
1375 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1376 )
1378 # Force reset of dataset type cache
1379 butler.registry.refresh()
1381 datasetType_new = butler.registry.getDatasetType(datasetTypeName)
1382 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1383 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1385 metric_model = butler.get(datasetTypeName, dataId=dataId)
1386 self.assertNotEqual(type(metric_model), type(metric))
1387 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1389 # Put the model and read it back to show that everything now
1390 # works as normal.
1391 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1392 metric_model_new = butler.get(metric_ref)
1393 self.assertEqual(metric_model_new, metric_model)
1395 # Hack the storage class again, this time to one for which get()
1396 # fails because no conversion class is available.
1397 manager._db.update(
1398 manager._static.dataset_type,
1399 {"name": datasetTypeName},
1400 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1401 )
1402 butler.registry.refresh()
1404 with self.assertRaises(ValueError):
1405 butler.get(datasetTypeName, dataId=dataId)
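
# A minimal defensive sketch suggested by the test above (illustrative, not
# part of the suite): before calling get(), verify that the registry's
# current storage class for a dataset type is the one the caller expects,
# since a changed definition can make get() raise ValueError.
def _check_storage_class(butler, datasetTypeName, expected="StructuredDataNoComponents"):
    actual = butler.registry.getDatasetType(datasetTypeName).storageClass.name
    if actual != expected:
        raise TypeError(f"{datasetTypeName} now uses {actual!r}, expected {expected!r}")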
1408class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1409 """InMemoryDatastore specialization of a butler"""
1411 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1412 fullConfigKey = None
1413 useTempRoot = False
1414 validationCanFail = False
1415 datastoreStr = ["datastore='InMemory"]
1416 datastoreName = ["InMemoryDatastore@"]
1417 registryStr = "/gen3.sqlite3"
1419 def testIngest(self):
1420 pass  # File ingest is not supported by the in-memory datastore.
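
# Quick sketch (illustrative) of building a fully in-memory butler like the
# one tested above: only the registry database named by registryStr touches
# disk; all dataset artifacts stay in RAM.
def _make_inmemory_butler(root):
    config = Config(os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml"))
    Butler.makeRepo(root, config=config)
    return Butler(root, writeable=True)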
1423class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1424 """PosixDatastore specialization"""
1426 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
1427 fullConfigKey = ".datastore.datastores.1.formatters"
1428 validationCanFail = True
1429 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
1430 datastoreName = [
1431 "InMemoryDatastore@",
1432 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
1433 "SecondDatastore",
1434 ]
1435 registryStr = "/gen3.sqlite3"
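
# Sketch of how the chained configuration above can be inspected: the
# fullConfigKey attribute is a dotted path into the expanded butler config,
# so the second child datastore's formatter tree is reachable directly
# (key layout assumed from the test attributes above).
def _chained_formatters():
    full = ButlerConfig(os.path.join(TESTDIR, "config/basic/butler-chained.yaml"))
    return full[".datastore.datastores.1.formatters"]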
1438class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
1439 """Test that a yaml file in one location can refer to a root in another."""
1441 datastoreStr = ["dir1"]
1442 # Disable the makeRepo test since we are deliberately not using
1443 # butler.yaml as the config name.
1444 fullConfigKey = None
1446 def setUp(self):
1447 self.root = makeTestTempDir(TESTDIR)
1449 # Make a new repository in one place
1450 self.dir1 = os.path.join(self.root, "dir1")
1451 Butler.makeRepo(self.dir1, config=Config(self.configFile))
1453 # Move the yaml file to a different place and add a "root"
1454 self.dir2 = os.path.join(self.root, "dir2")
1455 os.makedirs(self.dir2, exist_ok=True)
1456 configFile1 = os.path.join(self.dir1, "butler.yaml")
1457 config = Config(configFile1)
1458 config["root"] = self.dir1
1459 configFile2 = os.path.join(self.dir2, "butler2.yaml")
1460 config.dumpToUri(configFile2)
1461 os.remove(configFile1)
1462 self.tmpConfigFile = configFile2
1464 def testFileLocations(self):
1465 self.assertNotEqual(self.dir1, self.dir2)
1466 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
1467 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
1468 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
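
# Usage sketch for the relocated-config pattern tested above: because the
# copied yaml carries an explicit "root" entry, a Butler can be constructed
# from dir2 even though the data repository lives in dir1.
def _butler_from_relocated_config(dir2):
    return Butler(os.path.join(dir2, "butler2.yaml"))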
1471class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
1472 """Test that a config file created by makeRepo outside of repo works."""
1474 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1476 def setUp(self):
1477 self.root = makeTestTempDir(TESTDIR)
1478 self.root2 = makeTestTempDir(TESTDIR)
1480 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
1481 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1483 def tearDown(self):
1484 if os.path.exists(self.root2):
1485 shutil.rmtree(self.root2, ignore_errors=True)
1486 super().tearDown()
1488 def testConfigExistence(self):
1489 c = Config(self.tmpConfigFile)
1490 uri_config = ResourcePath(c["root"])
1491 uri_expected = ResourcePath(self.root, forceDirectory=True)
1492 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
1493 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
1495 def testPutGet(self):
1496 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1497 self.runPutGetTest(storageClass, "test_metric")
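
# Standalone sketch of the makeRepo/outfile pattern exercised above: the
# repository is created in one directory while its config is written
# somewhere else, and the Butler is then constructed from that external
# file (paths are whatever the caller chooses).
def _make_repo_with_external_config(repo_root, config_path):
    Butler.makeRepo(repo_root, outfile=config_path)
    return Butler(config_path, writeable=True)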
1500class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
1501 """Test that a config file created by makeRepo outside of repo works."""
1503 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1505 def setUp(self):
1506 self.root = makeTestTempDir(TESTDIR)
1507 self.root2 = makeTestTempDir(TESTDIR)
1509 self.tmpConfigFile = self.root2
1510 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1512 def testConfigExistence(self):
1513 # Append the yaml file else Config constructor does not know the file
1514 # type.
1515 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
1516 super().testConfigExistence()
1519class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
1520 """Test that a config file created by makeRepo outside of repo works."""
1522 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1524 def setUp(self):
1525 self.root = makeTestTempDir(TESTDIR)
1526 self.root2 = makeTestTempDir(TESTDIR)
1528 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
1529 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1532@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
1533@mock_s3
1534class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1535 """S3Datastore specialization of a butler; an S3 storage Datastore +
1536 a local in-memory SqlRegistry.
1537 """
1539 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
1540 fullConfigKey = None
1541 validationCanFail = True
1543 bucketName = "anybucketname"
1544 """Name of the Bucket that will be used in the tests. The name is read from
1545 the config file used with the tests during set-up.
1546 """
1548 root = "butlerRoot/"
1549 """Root repository directory expected to be used in case useTempRoot=False.
1550 Otherwise the root is set to a 20 characters long randomly generated string
1551 during set-up.
1552 """
1554 datastoreStr = [f"datastore={root}"]
1555 """Contains all expected root locations in a format expected to be
1556 returned by Butler stringification.
1557 """
1559 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
1560 """The expected format of the S3 Datastore string."""
1562 registryStr = "/gen3.sqlite3"
1563 """Expected format of the Registry string."""
1565 def genRoot(self):
1566 """Returns a random string of len 20 to serve as a root
1567 name for the temporary bucket repo.
1569 This is equivalent to tempfile.mkdtemp as this is what self.root
1570 becomes when useTempRoot is True.
1571 """
1572 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1573 return rndstr + "/"
1575 def setUp(self):
1576 config = Config(self.configFile)
1577 uri = ResourcePath(config[".datastore.datastore.root"])
1578 self.bucketName = uri.netloc
1580 # set up some fake credentials if they do not exist
1581 self.usingDummyCredentials = setAwsEnvCredentials()
1583 if self.useTempRoot:
1584 self.root = self.genRoot()
1585 rooturi = f"s3://{self.bucketName}/{self.root}"
1586 config.update({"datastore": {"datastore": {"root": rooturi}}})
1588 # Need a local folder to store the registry database.
1589 self.reg_dir = makeTestTempDir(TESTDIR)
1590 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1592 # Moto needs to know that we expect the bucket to exist
1593 # (the name used to be the class attribute bucketName).
1594 s3 = boto3.resource("s3")
1595 s3.create_bucket(Bucket=self.bucketName)
1597 self.datastoreStr = f"datastore={self.root}"
1598 self.datastoreName = [f"FileDatastore@{rooturi}"]
1599 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
1600 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
1602 def tearDown(self):
1603 s3 = boto3.resource("s3")
1604 bucket = s3.Bucket(self.bucketName)
1605 try:
1606 bucket.objects.all().delete()
1607 except botocore.exceptions.ClientError as e:
1608 if e.response["Error"]["Code"] == "404":
1609 # the key was not reachable - pass
1610 pass
1611 else:
1612 raise
1614 bucket = s3.Bucket(self.bucketName)
1615 bucket.delete()
1617 # unset any potentially set dummy credentials
1618 if self.usingDummyCredentials:
1619 unsetAwsEnvCredentials()
1621 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1622 shutil.rmtree(self.reg_dir, ignore_errors=True)
1624 if self.useTempRoot and os.path.exists(self.root):
1625 shutil.rmtree(self.root, ignore_errors=True)
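
# Minimal standalone sketch of the moto pattern used by the class above:
# the mock_s3 decorator intercepts boto3 calls so no real AWS traffic
# occurs. It assumes dummy credentials such as those setUp installs via
# setAwsEnvCredentials; the bucket name is illustrative.
@mock_s3
def _demo_mock_bucket():
    s3 = boto3.resource("s3")
    s3.create_bucket(Bucket="demo-bucket")
    return [b.name for b in s3.buckets.all()]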
1628@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!")
1629# Mock required environment variables during tests
1630@unittest.mock.patch.dict(
1631 os.environ,
1632 {
1633 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
1634 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
1635 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
1636 },
1637)
1638class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1639 """WebdavDatastore specialization of a butler; a Webdav storage Datastore +
1640 a local in-memory SqlRegistry.
1641 """
1643 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml")
1644 fullConfigKey = None
1645 validationCanFail = True
1647 serverName = "localhost"
1648 """Name of the server that will be used in the tests.
1649 """
1651 portNumber = 8080
1652 """Port on which the webdav server listens. Automatically chosen
1653 at setUpClass via the _getfreeport() method
1654 """
1656 root = "butlerRoot/"
1657 """Root repository directory expected to be used in case useTempRoot=False.
1658 Otherwise the root is set to a 20 characters long randomly generated string
1659 during set-up.
1660 """
1662 datastoreStr = [f"datastore={root}"]
1663 """Contains all expected root locations in a format expected to be
1664 returned by Butler stringification.
1665 """
1667 datastoreName = ["FileDatastore@https://{serverName}/{root}"]
1668 """The expected format of the WebdavDatastore string."""
1670 registryStr = "/gen3.sqlite3"
1671 """Expected format of the Registry string."""
1673 serverThread = None
1674 """Thread in which the local webdav server will run"""
1676 stopWebdavServer = False
1677 """This flag will cause the webdav server to
1678 gracefully shut down when True
1679 """
1681 def genRoot(self):
1682 """Returns a random string of len 20 to serve as a root
1683 name for the temporary bucket repo.
1685 This is equivalent to tempfile.mkdtemp as this is what self.root
1686 becomes when useTempRoot is True.
1687 """
1688 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1689 return rndstr + "/"
1691 @classmethod
1692 def setUpClass(cls):
1693 # Do the same as inherited class
1694 cls.storageClassFactory = StorageClassFactory()
1695 cls.storageClassFactory.addFromConfig(cls.configFile)
1697 cls.portNumber = cls._getfreeport()
1698 # Run a local webdav server on which tests will be run
1699 cls.serverThread = Thread(
1700 target=cls._serveWebdav, args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), daemon=True
1701 )
1702 cls.serverThread.start()
1703 # Wait for it to start
1704 time.sleep(3)
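
    # Illustrative alternative to the fixed sleep above (not called by the
    # tests): poll until the server answers as a webdav endpoint, failing
    # after a timeout. Uses isWebdavEndpoint, already imported at the top
    # of this module.
    @classmethod
    def _waitForWebdav(cls, timeout=10.0):
        deadline = time.time() + timeout
        url = f"http://{cls.serverName}:{cls.portNumber}/"
        while time.time() < deadline:
            try:
                if isWebdavEndpoint(url):
                    return
            except Exception:
                pass  # Server not accepting connections yet.
            time.sleep(0.1)
        raise OSError(f"Webdav server at {url} did not come up within {timeout}s")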
1706 @classmethod
1707 def tearDownClass(cls):
1708 # Ask for a graceful shutdown of the webdav server.
1709 cls.stopWebdavServer = True
1710 # Wait for the thread to exit
1711 cls.serverThread.join()
1713 # Mock required environment variables during tests
1714 @unittest.mock.patch.dict(
1715 os.environ,
1716 {
1717 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
1718 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
1719 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
1720 },
1721 )
1722 def setUp(self):
1723 config = Config(self.configFile)
1725 if self.useTempRoot:
1726 self.root = self.genRoot()
1727 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}"
1728 config.update({"datastore": {"datastore": {"root": self.rooturi}}})
1730 # Need a local folder to store the registry database.
1731 self.reg_dir = makeTestTempDir(TESTDIR)
1732 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1734 self.datastoreStr = f"datastore={self.root}"
1735 self.datastoreName = [f"FileDatastore@{self.rooturi}"]
1737 if not isWebdavEndpoint(self.rooturi):
1738 raise OSError("Webdav server not running properly: cannot run tests.")
1740 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False)
1741 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml")
1743 # Mock required environment variables during tests
1744 @unittest.mock.patch.dict(
1745 os.environ,
1746 {
1747 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
1748 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
1749 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
1750 },
1751 )
1752 def tearDown(self):
1753 # Clear temporary directory
1754 ResourcePath(self.rooturi).remove()
1755 ResourcePath(self.rooturi).session.close()
1757 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1758 shutil.rmtree(self.reg_dir, ignore_errors=True)
1760 if self.useTempRoot and os.path.exists(self.root):
1761 shutil.rmtree(self.root, ignore_errors=True)
1763 def _serveWebdav(self, port: int, stopWebdavServer):
1764 """Starts a local webdav-compatible HTTP server,
1765 Listening on http://localhost:port
1766 This server only runs when this test class is instantiated,
1767 and then shuts down. Must be started is a separate thread.
1769 Parameters
1770 ----------
1771 port : `int`
1772 The port number on which the server should listen
1773 """
1774 root_path = gettempdir()
1776 config = {
1777 "host": "0.0.0.0",
1778 "port": port,
1779 "provider_mapping": {"/": root_path},
1780 "http_authenticator": {"domain_controller": None},
1781 "simple_dc": {"user_mapping": {"*": True}},
1782 "verbose": 0,
1783 }
1784 app = WsgiDAVApp(config)
1786 server_args = {
1787 "bind_addr": (config["host"], config["port"]),
1788 "wsgi_app": app,
1789 }
1790 server = wsgi.Server(**server_args)
1791 server.prepare()
1793 try:
1794 # Start the actual server in a separate thread
1795 t = Thread(target=server.serve, daemon=True)
1796 t.start()
1797 # Watch stopWebdavServer, and gracefully shut down
1798 # the server when it returns True.
1799 while True:
1800 if stopWebdavServer():
1801 break
1802 time.sleep(1)
1803 except KeyboardInterrupt:
1804 print("Caught Ctrl-C, shutting down...")
1805 finally:
1806 server.stop()
1807 t.join()
1809 def _getfreeport():
1810 """
1811 Determines a free port using sockets.
1812 """
1813 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
1814 free_socket.bind(("0.0.0.0", 0))
1815 free_socket.listen()
1816 port = free_socket.getsockname()[1]
1817 free_socket.close()
1818 return port
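
# Alternative sketch that avoids the close-then-rebind race inherent in
# _getfreeport above (another process could claim the port between close()
# and the server's own bind): bind to port 0 and hand the still-open socket
# to the server rather than just the number. Illustrative only; the cheroot
# server used above is configured with a (host, port) pair instead.
def _bound_socket_and_port():
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind(("0.0.0.0", 0))
    sock.listen()
    return sock, sock.getsockname()[1]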
1821class PosixDatastoreTransfers(unittest.TestCase):
1822 """Test data transfers between butlers.
1824 Test for different managers. UUID to UUID and integer to integer are
1825 tested. UUID to integer is not supported since we do not currently
1826 want to allow that. Integer to UUID is supported with the caveat
1827 that UUID4 will be generated and this will be incorrect for raw
1828 dataset types. The test ignores that.
1829 """
1831 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1833 @classmethod
1834 def setUpClass(cls):
1835 cls.storageClassFactory = StorageClassFactory()
1836 cls.storageClassFactory.addFromConfig(cls.configFile)
1838 def setUp(self):
1839 self.root = makeTestTempDir(TESTDIR)
1840 self.config = Config(self.configFile)
1842 def tearDown(self):
1843 removeTestTempDir(self.root)
1845 def create_butler(self, manager, label):
1846 config = Config(self.configFile)
1847 config["registry", "managers", "datasets"] = manager
1848 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
1850 def create_butlers(self, manager1, manager2):
1851 self.source_butler = self.create_butler(manager1, "1")
1852 self.target_butler = self.create_butler(manager2, "2")
1854 def testTransferUuidToUuid(self):
1855 self.create_butlers(
1856 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1857 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1858 )
1859 # Setting id_gen_map should have no effect here
1860 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1862 def testTransferIntToInt(self):
1863 self.create_butlers(
1864 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1865 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1866 )
1867 # Integer dataset IDs only allow UNIQUE.
1868 self.assertButlerTransfers()
1870 def testTransferIntToUuid(self):
1871 self.create_butlers(
1872 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1873 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1874 )
1875 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1877 def testTransferMissing(self):
1878 """Test transfers where datastore records are missing.
1880 This is how execution butler works.
1881 """
1882 self.create_butlers(
1883 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1884 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1885 )
1887 # Configure the source butler to allow trust.
1888 self.source_butler.datastore.trustGetRequest = True
1890 self.assertButlerTransfers(purge=True)
1892 def testTransferMissingDisassembly(self):
1893 """Test transfers where datastore records are missing.
1895 This is how execution butler works.
1896 """
1897 self.create_butlers(
1898 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1899 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1900 )
1902 # Configure the source butler to allow trust.
1903 self.source_butler.datastore.trustGetRequest = True
1905 # Test disassembly.
1906 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
1908 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"):
1909 """Test that a run can be transferred to another butler."""
1911 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
1912 datasetTypeName = "random_data"
1914 # The test will create 3 collections; we will transfer
1915 # two of the three.
1916 runs = ["run1", "run2", "other"]
1918 # We also use two different dataset types to ensure that
1919 # grouping works.
1920 datasetTypeNames = ["random_data", "random_data_2"]
1922 # Create the run collections in the source butler.
1923 for run in runs:
1924 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
1926 # Create dimensions in both butlers (transfer will not create them).
1927 n_exposures = 30
1928 for butler in (self.source_butler, self.target_butler):
1929 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1930 butler.registry.insertDimensionData(
1931 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
1932 )
1933 butler.registry.insertDimensionData(
1934 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
1935 )
1937 for i in range(n_exposures):
1938 butler.registry.insertDimensionData(
1939 "exposure",
1940 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
1941 )
1943 # Create dataset types in the source butler.
1944 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
1945 for datasetTypeName in datasetTypeNames:
1946 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
1947 self.source_butler.registry.registerDatasetType(datasetType)
1949 # Write a dataset to an unrelated run -- this will ensure that
1950 # we are rewriting integer dataset ids in the target if necessary.
1951 # Will not be relevant for UUID.
1952 run = "distraction"
1953 butler = Butler(butler=self.source_butler, run=run)
1954 butler.put(
1955 makeExampleMetrics(),
1956 datasetTypeName,
1957 exposure=1,
1958 instrument="DummyCamComp",
1959 physical_filter="d-r",
1960 )
1962 # Write some example metrics to the source
1963 butler = Butler(butler=self.source_butler)
1965 # Set of DatasetRefs that should be in the list of refs to transfer
1966 # but which will not be transferred.
1967 deleted = set()
1969 n_expected = 20 # Number of datasets expected to be transferred
1970 source_refs = []
1971 for i in range(n_exposures):
1972 # Put a third of the datasets into each collection; retain
1973 # only two thirds.
1974 index = i % 3
1975 run = runs[index]
1976 datasetTypeName = datasetTypeNames[i % 2]
1978 metric_data = {
1979 "summary": {"counter": i},
1980 "output": {"text": "metric"},
1981 "data": [2 * x for x in range(i)],
1982 }
1983 metric = MetricsExample(**metric_data)
1984 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
1985 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
1987 # Remove the datastore record using low-level API
1988 if purge:
1989 # Remove records for a fraction.
1990 if index == 1:
1992 # For one of these delete the file as well.
1993 # This allows the "missing" code to filter the
1994 # file out.
1995 if not deleted:
1996 primary, uris = butler.datastore.getURIs(ref)
1997 if primary:
1998 primary.remove()
1999 for uri in uris.values():
2000 uri.remove()
2001 n_expected -= 1
2002 deleted.add(ref)
2004 # Remove the datastore record.
2005 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id})
2007 if index < 2:
2008 source_refs.append(ref)
2009 if ref not in deleted:
2010 new_metric = butler.get(ref.unresolved(), collections=run)
2011 self.assertEqual(new_metric, metric)
2013 # Create some bad dataset types to ensure we check for inconsistent
2014 # definitions.
2015 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
2016 for datasetTypeName in datasetTypeNames:
2017 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
2018 self.target_butler.registry.registerDatasetType(datasetType)
2019 with self.assertRaises(ConflictingDefinitionError):
2020 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2021 # And remove the bad definitions.
2022 for datasetTypeName in datasetTypeNames:
2023 self.target_butler.registry.removeDatasetType(datasetTypeName)
2025 # Transfer without creating dataset types should fail.
2026 with self.assertRaises(KeyError):
2027 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2029 # Now transfer them to the second butler
2030 with self.assertLogs(level=logging.DEBUG) as cm:
2031 transferred = self.target_butler.transfer_from(
2032 self.source_butler, source_refs, id_gen_map=id_gen_map, register_dataset_types=True
2033 )
2034 self.assertEqual(len(transferred), n_expected)
2035 log_output = ";".join(cm.output)
2036 self.assertIn("found in datastore for chunk", log_output)
2037 self.assertIn("Creating output run", log_output)
2039 # Do the transfer twice to ensure that it will do nothing extra.
2040 # Only do this if purge=True because it does not work for int
2041 # dataset_id.
2042 if purge:
2043 # This should not need to register dataset types.
2044 transferred = self.target_butler.transfer_from(
2045 self.source_butler, source_refs, id_gen_map=id_gen_map
2046 )
2047 self.assertEqual(len(transferred), n_expected)
2049 # Also do an explicit low-level transfer to trigger some
2050 # edge cases.
2051 with self.assertLogs(level=logging.DEBUG) as cm:
2052 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
2053 log_output = ";".join(cm.output)
2054 self.assertIn("no file artifacts exist", log_output)
2056 with self.assertRaises(TypeError):
2057 self.target_butler.datastore.transfer_from(self.source_butler, source_refs)
2059 with self.assertRaises(ValueError):
2060 self.target_butler.datastore.transfer_from(
2061 self.source_butler.datastore, source_refs, transfer="split"
2062 )
2064 # Now try to get the same refs from the new butler.
2065 for ref in source_refs:
2066 if ref not in deleted:
2067 unresolved_ref = ref.unresolved()
2068 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run)
2069 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run)
2070 self.assertEqual(new_metric, old_metric)
2072 # Now prune the run2 collection and create a CHAINED collection
2073 # in its place. This should block the transfer.
2074 self.target_butler.pruneCollection("run2", purge=True, unstore=True)
2075 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2076 with self.assertRaises(TypeError):
2077 # Re-importing the run1 datasets can be problematic if they
2078 # use integer IDs so filter those out.
2079 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2080 self.target_butler.transfer_from(self.source_butler, to_transfer, id_gen_map=id_gen_map)
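
# Condensed usage sketch of the public API exercised by the class above
# (collection and dataset type names are illustrative): query the source
# butler for refs, then hand them to the target, letting it register any
# missing dataset types.
def _copy_run(source_butler, target_butler):
    refs = list(source_butler.registry.queryDatasets("random_data", collections="run1"))
    return target_butler.transfer_from(source_butler, refs, register_dataset_types=True)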
2083if __name__ == "__main__":
2084 unittest.main()