Coverage for tests/test_butler.py: 16%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler.
"""

import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import socket
import string
import tempfile
import time
import unittest

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto's mock_s3 cannot be imported."""
        return cls
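

# With this fallback in place, S3-backed test classes can use the decorator
# unconditionally. A sketch of the assumed usage pattern (the skipIf guard is
# a convention, not something enforced by the decorator itself):
#
#     @unittest.skipIf(boto3 is None, "moto/boto3 not available")
#     @mock_s3
#     class HypotheticalS3ButlerTestCase(unittest.TestCase):
#         ...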

try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None

from tempfile import gettempdir
from threading import Thread

import astropy.time
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    CollectionSearch,
    CollectionType,
    Config,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError, MissingCollectionError
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.http import isWebdavEndpoint
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImport
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )
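

# The three positional arguments above are assumed to populate the summary,
# output and data attributes of MetricsExample, in that order; the put/get
# tests below compare retrieved components against those attributes directly.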


class TransactionTestError(Exception):
    """Specific error for testing transactions, used to prevent the
    misdiagnosis that might otherwise occur when a standard exception is
    raised.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not tested in any other test
    cases."""

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests:
    """Helper class for running a suite of put/get tests from different
    butler configurations."""

    root = None

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)
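
    # Concrete test classes are expected to provide a ``configFile`` class
    # attribute (consumed by setUpClass above) plus a ``tmpConfigFile``; see
    # the TestCase subclasses near the bottom of this file. A minimal sketch:
    #
    #     class MyButlerTestCase(ButlerPutGetTests, unittest.TestCase):
    #         configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")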

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
        removeTestTempDir(self.root)

    def create_butler(self, run, storageClass, datasetTypeName):
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add a second visit for some later tests
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 424,
                "name": "fourtwentyfour",
                "physical_filter": "d-r",
                "visit_system": 1,
            },
        )
        return butler, datasetType

    def runPutGetTest(self, storageClass, datasetTypeName):
        # New datasets will be added to a run collection, and that run is
        # what we will search when looking the datasets up again.
        run = "ingest"
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

            # Can the artifacts themselves be retrieved?
            if not butler.datastore.isEphemeral:
                root_uri = ResourcePath(self.root)

                for preserve_path in (True, False):
                    destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                    # Use copy so that we can test that overwrite
                    # protection works (using "auto" for File URIs would
                    # use hard links and subsequent transfer would work
                    # because it knows they are the same file).
                    transferred = butler.retrieveArtifacts(
                        [ref], destination, preserve_path=preserve_path, transfer="copy"
                    )
                    self.assertGreater(len(transferred), 0)
                    artifacts = list(ResourcePath.findFileResources([destination]))
                    self.assertEqual(set(transferred), set(artifacts))

                    for artifact in transferred:
                        path_in_destination = artifact.relative_to(destination)
                        self.assertIsNotNone(path_in_destination)

                        # When the path is not preserved there should not be
                        # any path separators.
                        num_seps = path_in_destination.count("/")
                        if preserve_path:
                            self.assertGreater(num_seps, 0)
                        else:
                            self.assertEqual(num_seps, 0)

                    primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                    n_uris = len(secondary_uris)
                    if primary_uri:
                        n_uris += 1
                    self.assertEqual(
                        len(artifacts),
                        n_uris,
                        "Comparing expected artifacts vs actual:"
                        f" {artifacts} vs {primary_uri} and {secondary_uris}",
                    )

                    if preserve_path:
                        # No need to run these twice
                        with self.assertRaises(ValueError):
                            butler.retrieveArtifacts([ref], destination, transfer="move")

                        with self.assertRaises(FileExistsError):
                            butler.retrieveArtifacts([ref], destination)

                        transferred_again = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, overwrite=True
                        )
                        self.assertEqual(set(transferred_again), set(transferred))

            # Now remove the dataset completely.
            butler.pruneDatasets([ref], purge=True, unstore=True, run=this_run)
            # Lookup with original args should still fail.
            with self.assertRaises(LookupError):
                butler.datasetExists(*args, collections=this_run)
            # getDirect() should still fail.
            with self.assertRaises(FileNotFoundError):
                butler.getDirect(ref)
            # Registry shouldn't be able to find it by dataset_id anymore.
            self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know the collections
            # are empty.
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present
        return butler

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # The second time it will be allowed but indicate a no-op.
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with TypeError.
        with self.assertRaises(TypeError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is a TypeError.
        with self.assertRaises(TypeError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(TypeError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Deleting the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run="ingest")
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {"ingest"})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, CollectionSearch.fromExpression(["other"]))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
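        # The index serialized below is, roughly, a flat mapping of label to
        # config URI. A sketch of the YAML form, given the assignments in
        # this test:
        #
        #     label: <path to this repo's butler.yaml>
        #     bad_label: s3://bucket/not_real.yaml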
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testIngest(self):
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile, refs=refs, formatter=MultiDetectorFormatter))
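
        # Both refs deliberately point at the same file; MultiDetectorFormatter
        # (from lsst.daf.butler.tests) is presumably responsible for slicing
        # out the per-detector payload when each dataset is read back.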
        butler.ingest(*datasets, transfer="copy", record_validation_info=False)

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore, since in-memory cannot ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

    def testPruneCollections(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge but not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [])

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)

    def testPickle(self):
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            (
                "instrument",
                {"instrument": "DummyCam"},
                {"instrument": "DummyHSC"},
                {"instrument": "DummyCamComp"},
            ),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are not
        # created for its components, but querying for components can still
        # return them.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(LookupError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # Create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""
        butler = Butler(self.tmpConfigFile, run="ingest")

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check if a file exists at a given path (relative to root).

        Test testPutTemplates verifies the actual physical existence of the
        files in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric1/??#?/d-r/DummyCamComp_423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric2/d-r/DummyCamComp_v423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """Test exporting to a temp directory and importing back into a new
        temp directory repo. Does not assume a POSIX datastore."""
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        print("Root:", exportButler.datastore.root)
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements even
                # though there aren't any in these datasets or in the database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public api.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                        reuse_ids=False,
                    )
                importButler = Butler(importDir, run="ingest")
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )

    def testRemoveRuns(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put a dataset in each.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])
        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler"""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self):
        """Independent test of constructor using PathLike."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self):
        """Test local export using several file-based transfer modes."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        datastoreRoot = exportButler.datastore.root

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume local file system
            self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
                    export.saveDatasets(datasets)
                    for path in pathsInStore:
                        self.assertTrue(
                            self.checkFileExists(exportDir, path),
                            f"Check that mode {transfer} exported files",
                        )

    def testPruneDatasets(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Simple prune.
        butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
        with self.assertRaises(LookupError):
            butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)

        # Put data back.
        ref1 = butler.put(metric, ref1.unresolved(), run=run1)
        ref2 = butler.put(metric, ref2.unresolved(), run=run2)
        ref3 = butler.put(metric, ref3.unresolved(), run=run1)

        # Check that in normal mode, deleting the record will lead to
        # trash not touching the file.
        uri1 = butler.datastore.getURI(ref1)
        butler.datastore.bridge.moveToTrash([ref1])  # Update the dataset_location table
        butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
        butler.datastore.trash(ref1)
        butler.datastore.emptyTrash()
        self.assertTrue(uri1.exists())
        uri1.remove()  # Clean it up.
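
        # Note: the bridge.moveToTrash and _table.delete calls above reach
        # into datastore internals deliberately, to fabricate a registry/
        # datastore inconsistency that the public API would not normally
        # allow.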
1349 # Simulate execution butler setup by deleting the datastore
1350 # record but keeping the file around and trusting.
1351 butler.datastore.trustGetRequest = True
1352 uri2 = butler.datastore.getURI(ref2)
1353 uri3 = butler.datastore.getURI(ref3)
1354 self.assertTrue(uri2.exists())
1355 self.assertTrue(uri3.exists())
1357 # Remove the datastore record.
1358 butler.datastore.bridge.moveToTrash([ref2]) # Update the dataset_location table
1359 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
1360 self.assertTrue(uri2.exists())
1361 butler.datastore.trash([ref2, ref3])
1362 # Immediate removal for ref2 file
1363 self.assertFalse(uri2.exists())
1364 # But ref3 has to wait for the empty.
1365 self.assertTrue(uri3.exists())
1366 butler.datastore.emptyTrash()
1367 self.assertFalse(uri3.exists())
1369 # Clear out the datasets from registry.
1370 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
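
    # Minimal sketch of the pruneDatasets flavors exercised above, assuming
    # the same Butler API; ``butler`` and ``refs`` are hypothetical arguments
    # and this helper is not collected as a test.
    def _sketchPruneModes(self, butler, refs):
        # Remove stored file artifacts but keep the registry entries.
        butler.pruneDatasets(refs, disassociate=False, unstore=True)
        # Remove everything: registry entries and stored artifacts.
        butler.pruneDatasets(refs, purge=True, unstore=True)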

    def testPytypePutCoercion(self):
        """Test python type coercion on Butler.put."""

        # Store some data with the normal example storage class.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        datasetTypeName = "test_metric"
        butler, _ = self.create_butler("ingest", storageClass, datasetTypeName)

        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Put a dict; this should coerce to a MetricsExample. The visit=424
        # keyword overrides the visit in dataId so a new dataset is written.
        test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
        metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
        test_metric = butler.getDirect(metric_ref)
        self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
        self.assertEqual(test_metric.summary, test_dict["summary"])
        self.assertEqual(test_metric.output, test_dict["output"])
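
    # Sketch of the put-time coercion contract shown above, assuming a
    # storage class that can convert from `dict`; the helper name and
    # arguments are hypothetical and it is not collected as a test.
    def _sketchPutDict(self, butler, datasetTypeName, dataId):
        """Sketch: put a plain dict and read back the coerced composite."""
        ref = butler.put({"summary": {"a": 1}, "output": {"b": 2}}, datasetTypeName, dataId=dataId)
        return butler.getDirect(ref)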

    def testPytypeCoercion(self):
        """Test python type coercion on Butler.get and put."""

        # Store some data with the normal example storage class.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        datasetTypeName = "test_metric"
        butler = self.runPutGetTest(storageClass, datasetTypeName)

        dataId = {"instrument": "DummyCamComp", "visit": 423}
        metric = butler.get(datasetTypeName, dataId=dataId)
        self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")

        datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
        self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")

        # Now need to hack the registry dataset type definition.
        # There is no API for this. Database.update() takes a WHERE mapping
        # from column name to the row key holding the match value, which is
        # why the row dict is keyed by the dataset type name itself.
        manager = butler.registry._managers.datasets
        manager._db.update(
            manager._static.dataset_type,
            {"name": datasetTypeName},
            {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
        )

        # Force a reset of the dataset type cache.
        butler.registry.refresh()

        datasetType_new = butler.registry.getDatasetType(datasetTypeName)
        self.assertEqual(datasetType_new.name, datasetType_ori.name)
        self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")

        metric_model = butler.get(datasetTypeName, dataId=dataId)
        self.assertNotEqual(type(metric_model), type(metric))
        self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")

        # Put the model and read it back to show that everything now
        # works as normal.
        metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
        metric_model_new = butler.get(metric_ref)
        self.assertEqual(metric_model_new, metric_model)

        # Hack the storage class again to something that will fail on the
        # get because there is no conversion class.
        manager._db.update(
            manager._static.dataset_type,
            {"name": datasetTypeName},
            {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
        )
        butler.registry.refresh()

        with self.assertRaises(ValueError):
            butler.get(datasetTypeName, dataId=dataId)
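

# Hedged sketch of the internal registry hack used in testPytypeCoercion,
# lifted directly from the calls above; this touches private attributes and
# is fragile, for test code only.
def _sketch_override_storage_class(butler, dataset_type_name, storage_class_name):
    """Sketch: rewrite a dataset type's storage class directly in the DB."""
    manager = butler.registry._managers.datasets
    manager._db.update(
        manager._static.dataset_type,
        {"name": dataset_type_name},
        {dataset_type_name: dataset_type_name, "storage_class": storage_class_name},
    )
    # Invalidate the cached definition so subsequent lookups see the change.
    butler.registry.refresh()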


class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
    """InMemoryDatastore specialization of a butler."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
    fullConfigKey = None
    useTempRoot = False
    validationCanFail = False
    datastoreStr = ["datastore='InMemory"]
    datastoreName = ["InMemoryDatastore@"]
    registryStr = "/gen3.sqlite3"

    def testIngest(self):
        # File ingest is not supported by the in-memory datastore.
        pass


class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
    """ChainedDatastore specialization of a butler."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
    fullConfigKey = ".datastore.datastores.1.formatters"
    validationCanFail = True
    datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
    datastoreName = [
        "InMemoryDatastore@",
        f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
        "SecondDatastore",
    ]
    registryStr = "/gen3.sqlite3"
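

# Small sketch showing the dotted-key Config access that fullConfigKey above
# relies on; the key path is copied from the class attribute and the helper
# itself is hypothetical.
def _sketch_read_chained_formatters(config_file):
    """Sketch: read the formatter section of the second chained datastore."""
    config = Config(config_file)
    # Dotted keys traverse nested dictionaries and list indexes.
    return config[".datastore.datastores.1.formatters"]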


class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
    """Test that a yaml file in one location can refer to a root in another."""

    datastoreStr = ["dir1"]
    # Disable the makeRepo test since we are deliberately not using
    # butler.yaml as the config name.
    fullConfigKey = None

    def setUp(self):
        self.root = makeTestTempDir(TESTDIR)

        # Make a new repository in one place.
        self.dir1 = os.path.join(self.root, "dir1")
        Butler.makeRepo(self.dir1, config=Config(self.configFile))

        # Move the yaml file to a different place and add a "root".
        self.dir2 = os.path.join(self.root, "dir2")
        os.makedirs(self.dir2, exist_ok=True)
        configFile1 = os.path.join(self.dir1, "butler.yaml")
        config = Config(configFile1)
        config["root"] = self.dir1
        configFile2 = os.path.join(self.dir2, "butler2.yaml")
        config.dumpToUri(configFile2)
        os.remove(configFile1)
        self.tmpConfigFile = configFile2

    def testFileLocations(self):
        self.assertNotEqual(self.dir1, self.dir2)
        self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
        self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
        self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
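
    # Sketch, assuming the same "root" redirection as set up above: opening
    # the relocated YAML should yield a datastore rooted in dir1 even though
    # the config itself lives in dir2. Helper is illustrative only.
    def _sketchOpenRelocated(self):
        butler = Butler(self.tmpConfigFile)
        self.assertIn("dir1", str(butler.datastore.root))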


class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
    """Test that a config file created by makeRepo outside of the repo works."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        self.root = makeTestTempDir(TESTDIR)
        self.root2 = makeTestTempDir(TESTDIR)

        self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
        Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)

    def tearDown(self):
        if os.path.exists(self.root2):
            shutil.rmtree(self.root2, ignore_errors=True)
        super().tearDown()

    def testConfigExistence(self):
        c = Config(self.tmpConfigFile)
        uri_config = ResourcePath(c["root"])
        uri_expected = ResourcePath(self.root, forceDirectory=True)
        self.assertEqual(uri_config.geturl(), uri_expected.geturl())
        self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")

    def testPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")


class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
    """Test that a config file created by makeRepo outside of the repo works
    when the outfile is a directory."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        self.root = makeTestTempDir(TESTDIR)
        self.root2 = makeTestTempDir(TESTDIR)

        self.tmpConfigFile = self.root2
        Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)

    def testConfigExistence(self):
        # Append the yaml file name, else the Config constructor does not
        # know the file type.
        self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
        super().testConfigExistence()


class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
    """Test that a config file created by makeRepo outside of the repo works
    when the outfile is given as a URI."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        self.root = makeTestTempDir(TESTDIR)
        self.root2 = makeTestTempDir(TESTDIR)

        self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
        Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)


@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
@mock_s3
class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """S3Datastore specialization of a butler: an S3 storage datastore
    plus a local SQLite registry.
    """

    configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
    fullConfigKey = None
    validationCanFail = True

    bucketName = "anybucketname"
    """Name of the bucket that will be used in the tests. The name is read
    from the config file used with the tests during set-up.
    """

    root = "butlerRoot/"
    """Root repository directory used when useTempRoot is False. Otherwise
    the root is set to a randomly generated 20-character string during
    set-up.
    """

    datastoreStr = [f"datastore={root}"]
    """Contains all expected root locations in the format returned by Butler
    stringification.
    """

    datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
    """The expected format of the S3 datastore string."""

    registryStr = "/gen3.sqlite3"
    """Expected format of the registry string."""

    def genRoot(self):
        """Return a random 20-character string to serve as the root name for
        the temporary bucket repo.

        This plays the role of tempfile.mkdtemp: it is what self.root
        becomes when useTempRoot is True.
        """
        rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
        return rndstr + "/"

    def setUp(self):
        config = Config(self.configFile)
        uri = ResourcePath(config[".datastore.datastore.root"])
        self.bucketName = uri.netloc

        # Set up some fake credentials if they do not exist.
        self.usingDummyCredentials = setAwsEnvCredentials()

        if self.useTempRoot:
            self.root = self.genRoot()
        rooturi = f"s3://{self.bucketName}/{self.root}"
        config.update({"datastore": {"datastore": {"root": rooturi}}})

        # Need a local folder to store the registry database.
        self.reg_dir = makeTestTempDir(TESTDIR)
        config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"

        # Moto needs to know that we expect the bucket to exist
        # (this used to be the class attribute bucketName).
        s3 = boto3.resource("s3")
        s3.create_bucket(Bucket=self.bucketName)

        self.datastoreStr = f"datastore={self.root}"
        self.datastoreName = [f"FileDatastore@{rooturi}"]
        Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
        self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")

    def tearDown(self):
        s3 = boto3.resource("s3")
        bucket = s3.Bucket(self.bucketName)
        try:
            bucket.objects.all().delete()
        except botocore.exceptions.ClientError as e:
            if e.response["Error"]["Code"] == "404":
                # The key was not reachable; nothing to delete.
                pass
            else:
                raise

        bucket = s3.Bucket(self.bucketName)
        bucket.delete()

        # Unset any potentially set dummy credentials.
        if self.usingDummyCredentials:
            unsetAwsEnvCredentials()

        if self.reg_dir is not None and os.path.exists(self.reg_dir):
            shutil.rmtree(self.reg_dir, ignore_errors=True)

        if self.useTempRoot and os.path.exists(self.root):
            shutil.rmtree(self.root, ignore_errors=True)
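

# Hedged sketch of the moto-backed bucket lifecycle used by the class above;
# assumes a @mock_s3 context is already active and mirrors the setUp calls.
def _sketch_make_mock_bucket(bucket_name="anybucketname"):
    """Sketch: create a mock bucket with dummy credentials; return both."""
    # Fake credentials keep boto3 from looking for real AWS configuration.
    usingDummyCredentials = setAwsEnvCredentials()
    s3 = boto3.resource("s3")
    s3.create_bucket(Bucket=bucket_name)
    return s3.Bucket(bucket_name), usingDummyCredentials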


@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!")
# Mock required environment variables during tests.
@unittest.mock.patch.dict(
    os.environ,
    {
        "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
        "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
        "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
    },
)
class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """WebdavDatastore specialization of a butler: a webdav storage datastore
    plus a local SQLite registry.
    """

    configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml")
    fullConfigKey = None
    validationCanFail = True

    serverName = "localhost"
    """Name of the server that will be used in the tests."""

    portNumber = 8080
    """Port on which the webdav server listens. Chosen automatically in
    setUpClass via the _getfreeport() method.
    """

    root = "butlerRoot/"
    """Root repository directory used when useTempRoot is False. Otherwise
    the root is set to a randomly generated 20-character string during
    set-up.
    """

    datastoreStr = [f"datastore={root}"]
    """Contains all expected root locations in the format returned by Butler
    stringification.
    """

    datastoreName = ["FileDatastore@https://{serverName}/{root}"]
    """The expected format of the WebdavDatastore string."""

    registryStr = "/gen3.sqlite3"
    """Expected format of the registry string."""

    serverThread = None
    """Thread in which the local webdav server will run."""

    stopWebdavServer = False
    """When set to True, causes the webdav server to shut down gracefully."""

    def genRoot(self):
        """Return a random 20-character string to serve as the root name for
        the temporary repo.

        This plays the role of tempfile.mkdtemp: it is what self.root
        becomes when useTempRoot is True.
        """
        rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
        return rndstr + "/"

    @classmethod
    def setUpClass(cls):
        # Do the same as the inherited class.
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

        cls.portNumber = cls._getfreeport()
        # Run a local webdav server against which the tests will be run.
        cls.serverThread = Thread(
            target=cls._serveWebdav, args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), daemon=True
        )
        cls.serverThread.start()
        # Wait for it to start.
        time.sleep(3)

    @classmethod
    def tearDownClass(cls):
        # Ask for a graceful shutdown of the webdav server.
        cls.stopWebdavServer = True
        # Wait for the thread to exit.
        cls.serverThread.join()

    # Mock required environment variables during tests.
    @unittest.mock.patch.dict(
        os.environ,
        {
            "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
            "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
            "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
        },
    )
    def setUp(self):
        config = Config(self.configFile)

        if self.useTempRoot:
            self.root = self.genRoot()
        self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}"
        config.update({"datastore": {"datastore": {"root": self.rooturi}}})

        # Need a local folder to store the registry database.
        self.reg_dir = makeTestTempDir(TESTDIR)
        config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"

        self.datastoreStr = f"datastore={self.root}"
        self.datastoreName = [f"FileDatastore@{self.rooturi}"]

        if not isWebdavEndpoint(self.rooturi):
            raise OSError("Webdav server not running properly: cannot run tests.")

        Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False)
        self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml")

    # Mock required environment variables during tests.
    @unittest.mock.patch.dict(
        os.environ,
        {
            "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
            "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
            "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
        },
    )
    def tearDown(self):
        # Clear the temporary directory.
        ResourcePath(self.rooturi).remove()
        ResourcePath(self.rooturi).session.close()

        if self.reg_dir is not None and os.path.exists(self.reg_dir):
            shutil.rmtree(self.reg_dir, ignore_errors=True)

        if self.useTempRoot and os.path.exists(self.root):
            shutil.rmtree(self.root, ignore_errors=True)

    def _serveWebdav(self, port: int, stopWebdavServer):
        """Start a local webdav-compatible HTTP server listening on
        http://localhost:port.

        The server only runs while this test class is being exercised and
        shuts down afterwards. Must be started in a separate thread.

        Parameters
        ----------
        port : `int`
            The port number on which the server should listen.
        stopWebdavServer : callable
            Callable that returns `True` when the server should shut down.
        """
        root_path = gettempdir()

        config = {
            "host": "0.0.0.0",
            "port": port,
            "provider_mapping": {"/": root_path},
            "http_authenticator": {"domain_controller": None},
            "simple_dc": {"user_mapping": {"*": True}},
            "verbose": 0,
        }
        app = WsgiDAVApp(config)

        server_args = {
            "bind_addr": (config["host"], config["port"]),
            "wsgi_app": app,
        }
        server = wsgi.Server(**server_args)
        server.prepare()

        try:
            # Start the actual server in a separate thread.
            t = Thread(target=server.serve, daemon=True)
            t.start()
            # Watch stopWebdavServer and gracefully shut down the server
            # when it returns True.
            while True:
                if stopWebdavServer():
                    break
                time.sleep(1)
        except KeyboardInterrupt:
            print("Caught Ctrl-C, shutting down...")
        finally:
            server.stop()
            t.join()

    @staticmethod
    def _getfreeport():
        """Determine a free port number by binding a temporary socket.

        Note: the port could in principle be claimed by another process
        between close() and the server bind.
        """
        free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        free_socket.bind(("0.0.0.0", 0))
        free_socket.listen()
        port = free_socket.getsockname()[1]
        free_socket.close()
        return port
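

# Illustrative sketch, not exercised by the tests: reserve a port the same
# way setUpClass does and build the root URI the server would use.
def _sketch_server_uri(server_name="localhost"):
    port = WebdavDatastoreButlerTestCase._getfreeport()
    # The test class probes this URI with isWebdavEndpoint() once the
    # server thread is up.
    return f"http://{server_name}:{port}/"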


class PosixDatastoreTransfers(unittest.TestCase):
    """Test data transfers between butlers.

    Tests run for different dataset manager combinations: UUID to UUID and
    integer to integer. UUID to integer is not supported since we do not
    currently want to allow that. Integer to UUID is supported, with the
    caveat that a UUID4 will be generated, which would be incorrect for raw
    dataset types; the tests ignore that.
    """

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def setUp(self):
        self.root = makeTestTempDir(TESTDIR)
        self.config = Config(self.configFile)

    def tearDown(self):
        removeTestTempDir(self.root)

    def create_butler(self, manager, label):
        config = Config(self.configFile)
        config["registry", "managers", "datasets"] = manager
        return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)

    def create_butlers(self, manager1, manager2):
        self.source_butler = self.create_butler(manager1, "1")
        self.target_butler = self.create_butler(manager2, "2")
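
    # Sketch of the pairing used by most tests below; the manager dotted path
    # is copied from the calls in this class and the helper is illustrative.
    def _sketchUuidPair(self):
        """Sketch: build the UUID-to-UUID source/target butler pair."""
        manager = (
            "lsst.daf.butler.registry.datasets.byDimensions."
            "ByDimensionsDatasetRecordStorageManagerUUID"
        )
        self.create_butlers(manager, manager)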

    def testTransferUuidToUuid(self):
        self.create_butlers(
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
        )
        # Setting id_gen_map should have no effect here.
        self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})

    def testTransferIntToInt(self):
        self.create_butlers(
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
        )
        # Integer dataset IDs only allow UNIQUE ID generation.
        self.assertButlerTransfers()

    def testTransferIntToUuid(self):
        self.create_butlers(
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
        )
        self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})

    def testTransferMissing(self):
        """Test transfers where datastore records are missing.

        This is how execution butler works.
        """
        self.create_butlers(
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
        )

        # Configure the source butler to allow trust.
        self.source_butler.datastore.trustGetRequest = True

        self.assertButlerTransfers(purge=True)

    def testTransferMissingDisassembly(self):
        """Test transfers where datastore records are missing and the
        composite is disassembled.

        This is how execution butler works.
        """
        self.create_butlers(
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
        )

        # Configure the source butler to allow trust.
        self.source_butler.datastore.trustGetRequest = True

        # Test disassembly.
        self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")

    def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"):
        """Test that a run can be transferred to another butler."""

        storageClass = self.storageClassFactory.getStorageClass(storageClassName)
        datasetTypeName = "random_data"

        # The test will create 3 collections and we will want to transfer
        # two of those three.
        runs = ["run1", "run2", "other"]

        # Also use two different dataset types to ensure that grouping works.
        datasetTypeNames = ["random_data", "random_data_2"]

        # Create the run collections in the source butler.
        for run in runs:
            self.source_butler.registry.registerCollection(run, CollectionType.RUN)

        # Create dimensions in both butlers (transfer will not create them).
        n_exposures = 30
        for butler in (self.source_butler, self.target_butler):
            butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
            butler.registry.insertDimensionData(
                "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
            )
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
            )

            for i in range(n_exposures):
                butler.registry.insertDimensionData(
                    "exposure",
                    {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
                )

        # Create dataset types in the source butler.
        # (Reuses the last ``butler`` from the loop above.)
        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        for datasetTypeName in datasetTypeNames:
            datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
            self.source_butler.registry.registerDatasetType(datasetType)

        # Write a dataset to an unrelated run -- this will ensure that
        # we are rewriting integer dataset ids in the target if necessary.
        # Will not be relevant for UUID.
        run = "distraction"
        butler = Butler(butler=self.source_butler, run=run)
        butler.put(
            makeExampleMetrics(),
            datasetTypeName,
            exposure=1,
            instrument="DummyCamComp",
            physical_filter="d-r",
        )

        # Write some example metrics to the source.
        butler = Butler(butler=self.source_butler)

        # Set of DatasetRefs that should be in the list of refs to transfer
        # but which will not be transferred.
        deleted = set()

        n_expected = 20  # Number of datasets expected to be transferred.
        source_refs = []
        for i in range(n_exposures):
            # Put a third of the datasets into each collection, but only
            # retain two thirds of them.
            index = i % 3
            run = runs[index]
            datasetTypeName = datasetTypeNames[i % 2]

            metric_data = {
                "summary": {"counter": i},
                "output": {"text": "metric"},
                "data": [2 * x for x in range(i)],
            }
            metric = MetricsExample(**metric_data)
            dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)

            # Remove the datastore record using the low-level API.
            if purge:
                # Remove records for a fraction of the datasets.
                if index == 1:

                    # For one of these, delete the file as well.
                    # This allows the "missing" code to filter the
                    # file out.
                    if not deleted:
                        primary, uris = butler.datastore.getURIs(ref)
                        if primary:
                            primary.remove()
                        for uri in uris.values():
                            uri.remove()
                        n_expected -= 1
                        deleted.add(ref)

                    # Remove the datastore record.
                    butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id})

            if index < 2:
                source_refs.append(ref)
            if ref not in deleted:
                new_metric = butler.get(ref.unresolved(), collections=run)
                self.assertEqual(new_metric, metric)

        # Create some bad dataset types to ensure we check for inconsistent
        # definitions.
        badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
        for datasetTypeName in datasetTypeNames:
            datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
            self.target_butler.registry.registerDatasetType(datasetType)
        with self.assertRaises(ConflictingDefinitionError):
            self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
        # And remove the bad definitions.
        for datasetTypeName in datasetTypeNames:
            self.target_butler.registry.removeDatasetType(datasetTypeName)

        # Transfer without creating dataset types should fail.
        with self.assertRaises(KeyError):
            self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)

        # Now transfer them to the second butler.
        with self.assertLogs(level=logging.DEBUG) as cm:
            transferred = self.target_butler.transfer_from(
                self.source_butler, source_refs, id_gen_map=id_gen_map, register_dataset_types=True
            )
        self.assertEqual(len(transferred), n_expected)
        log_output = ";".join(cm.output)
        self.assertIn("found in datastore for chunk", log_output)
        self.assertIn("Creating output run", log_output)

        # Do the transfer twice to ensure that it will do nothing extra.
        # Only do this if purge=True because it does not work for int
        # dataset_id.
        if purge:
            # This should not need to register dataset types.
            transferred = self.target_butler.transfer_from(
                self.source_butler, source_refs, id_gen_map=id_gen_map
            )
            self.assertEqual(len(transferred), n_expected)

        # Also do an explicit low-level transfer to trigger some edge cases.
        with self.assertLogs(level=logging.DEBUG) as cm:
            self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
        log_output = ";".join(cm.output)
        self.assertIn("no file artifacts exist", log_output)

        with self.assertRaises(TypeError):
            self.target_butler.datastore.transfer_from(self.source_butler, source_refs)

        with self.assertRaises(ValueError):
            self.target_butler.datastore.transfer_from(
                self.source_butler.datastore, source_refs, transfer="split"
            )

        # Now try to get the same refs from the new butler.
        for ref in source_refs:
            if ref not in deleted:
                unresolved_ref = ref.unresolved()
                new_metric = self.target_butler.get(unresolved_ref, collections=ref.run)
                old_metric = self.source_butler.get(unresolved_ref, collections=ref.run)
                self.assertEqual(new_metric, old_metric)

        # Now prune the run2 collection and create a CHAINED collection in
        # its place. This should block the transfer.
        self.target_butler.pruneCollection("run2", purge=True, unstore=True)
        self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
        with self.assertRaises(TypeError):
            # Re-importing the run1 datasets can be problematic if they
            # use integer IDs, so filter those out.
            to_transfer = [ref for ref in source_refs if ref.run == "run2"]
            self.target_butler.transfer_from(self.source_butler, to_transfer, id_gen_map=id_gen_map)
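

# Hedged sketch of the high-level transfer pattern exercised above, using
# only calls that appear in this file plus registry.queryDatasets; names and
# defaults are illustrative, not the tests' own helper.
def _sketch_transfer_run(source_butler, target_butler, run="run1"):
    """Sketch: transfer every dataset in ``run`` to another butler."""
    refs = list(source_butler.registry.queryDatasets(..., collections=run))
    # register_dataset_types=True lets the target learn unknown definitions.
    return target_butler.transfer_from(source_butler, refs, register_dataset_types=True)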


if __name__ == "__main__":
    unittest.main()