# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler."""
import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import socket
import string
import tempfile
import time
import unittest
import unittest.mock
try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto's mock_s3 cannot be imported."""
        return cls

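# Leaving boto3 as None lets the S3-specific test cases detect the missing
# dependency and skip themselves, while the no-op mock_s3 stand-in keeps any
# decorated test classes importable.
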
try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None

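# As above, WsgiDAVApp = None signals that the WebDAV server dependencies are
# unavailable so WebDAV-backed tests can check for it and skip.
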
from tempfile import gettempdir
from threading import Thread

import astropy.time
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    CollectionSearch,
    CollectionType,
    Config,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError, MissingCollectionError
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.http import isWebdavEndpoint
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImport
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))

def makeExampleMetrics():
    """Return a simple MetricsExample with summary, output, and data fields
    populated."""
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )

class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent the misdiagnosis
    that might otherwise occur when a standard exception is used.
    """

    pass

class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not covered by any other test
    cases."""

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")

class ButlerPutGetTests:
    """Helper class for running a suite of put/get tests from different
    butler configurations."""

    root = None

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
        removeTestTempDir(self.root)

    def create_butler(self, run, storageClass, datasetTypeName):
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "visit_system": 1,
                },
            )
        return butler, datasetType

    def runPutGetTest(self, storageClass, datasetTypeName):
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = "ingest"
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)
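        # With id=None this ref is unresolved; butler.put() resolves it and
        # returns a new DatasetRef carrying the dataset ID assigned by the
        # registry.
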
        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

            # Can the artifacts themselves be retrieved?
            if not butler.datastore.isEphemeral:
                root_uri = ResourcePath(self.root)

                for preserve_path in (True, False):
                    destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                    # Use copy so that we can test that overwrite
                    # protection works (using "auto" for File URIs would
                    # use hard links and subsequent transfer would work
                    # because it knows they are the same file).
                    transferred = butler.retrieveArtifacts(
                        [ref], destination, preserve_path=preserve_path, transfer="copy"
                    )
                    self.assertGreater(len(transferred), 0)
                    artifacts = list(ResourcePath.findFileResources([destination]))
                    self.assertEqual(set(transferred), set(artifacts))

                    for artifact in transferred:
                        path_in_destination = artifact.relative_to(destination)
                        self.assertIsNotNone(path_in_destination)

                        # When the path is not preserved there should not be
                        # any path separators.
                        num_seps = path_in_destination.count("/")
                        if preserve_path:
                            self.assertGreater(num_seps, 0)
                        else:
                            self.assertEqual(num_seps, 0)

                    primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                    n_uris = len(secondary_uris)
                    if primary_uri:
                        n_uris += 1
                    self.assertEqual(
                        len(artifacts),
                        n_uris,
                        "Comparing expected artifacts vs actual:"
                        f" {artifacts} vs {primary_uri} and {secondary_uris}",
                    )

                    if preserve_path:
                        # No need to run these twice
                        with self.assertRaises(ValueError):
                            butler.retrieveArtifacts([ref], destination, transfer="move")

                        with self.assertRaises(FileExistsError):
                            butler.retrieveArtifacts([ref], destination)

                        transferred_again = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, overwrite=True
                        )
                        self.assertEqual(set(transferred_again), set(transferred))

            # Now remove the dataset completely.
            butler.pruneDatasets([ref], purge=True, unstore=True, run=this_run)
            # Lookup with original args should still fail.
            with self.assertRaises(LookupError):
                butler.datasetExists(*args, collections=this_run)
            # getDirect() should still fail.
            with self.assertRaises(FileNotFoundError):
                butler.getDirect(ref)
            # Registry shouldn't be able to find it by dataset_id anymore.
            self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know they are
            # empty
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

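                # "counter" is a derived component: it is computed from the
                # parent composite on read rather than stored separately, so
                # it can also honor read parameters such as "slice" (checked
                # next).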
                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present
        return butler

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # A second registration is allowed but indicates a no-op.
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with TypeError.
        with self.assertRaises(TypeError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is a TypeError.
        with self.assertRaises(TypeError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(TypeError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Removing the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))

class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run="ingest")
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {"ingest"})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, CollectionSearch.fromExpression(["other"]))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testIngest(self):
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile, refs=refs, formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy", record_validation_info=False)

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore, since in-memory datastores cannot
        # ingest files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

    def testPruneCollections(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge and not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [])

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)

    def testPickle(self):
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            (
                "instrument",
                {"instrument": "DummyCam"},
                {"instrument": "DummyHSC"},
                {"instrument": "DummyCamComp"},
            ),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are not
        # created for its components, but querying can still return the
        # components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(LookupError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # Create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""
        butler = Butler(self.tmpConfigFile, run="ingest")

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)

class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check if a file exists at a given path (relative to root).

        Test testPutTemplates verifies the actual physical existence of the
        files in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric1/??#?/d-r/DummyCamComp_423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric2/d-r/DummyCamComp_v423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """This test does an export to a temporary directory and an import
        back into a new temporary directory repo. It does not assume a posix
        datastore.
        """
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        print("Root:", exportButler.datastore.root)
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements
                # even though there aren't any in these datasets or in the
                # database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public api.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                        reuse_ids=False,
                    )
                importButler = Butler(importDir, run="ingest")
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )

    def testRemoveRuns(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put a dataset in each.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])
        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())

class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler"""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"
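
    # These class attributes parameterize the shared tests defined above:
    # fullConfigKey is used by testMakeRepo, validationCanFail by
    # testGetDatasetTypes, and the datastore/registry strings and names are
    # asserted against in testStringification.
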
    def testPathConstructor(self):
        """Independent test of constructor using PathLike."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self):
        """Test local export using several transfer modes."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        datastoreRoot = exportButler.datastore.root

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume local file system
            self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
                    export.saveDatasets(datasets)
                for path in pathsInStore:
                    self.assertTrue(
                        self.checkFileExists(exportDir, path),
                        f"Check that mode {transfer} exported files",
                    )

1308 def testPruneDatasets(self):
1309 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1310 butler = Butler(self.tmpConfigFile, writeable=True)
1311 # Load registry data with dimensions to hang datasets off of.
1312 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1313 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1314 # Add some RUN-type collections.
1315 run1 = "run1"
1316 butler.registry.registerRun(run1)
1317 run2 = "run2"
1318 butler.registry.registerRun(run2)
1319 # put some datasets. ref1 and ref2 have the same data ID, and are in
1320 # different runs. ref3 has a different data ID.
1321 metric = makeExampleMetrics()
1322 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1323 datasetType = self.addDatasetType(
1324 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1325 )
1326 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1327 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1328 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1330 # Simple prune.
1331 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1332 with self.assertRaises(LookupError):
1333 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)
1335 # Put data back.
1336 ref1 = butler.put(metric, ref1.unresolved(), run=run1)
1337 ref2 = butler.put(metric, ref2.unresolved(), run=run2)
1338 ref3 = butler.put(metric, ref3.unresolved(), run=run1)
1340 # Check that in normal mode, deleting the record will lead to
1341 # trash not touching the file.
1342 uri1 = butler.datastore.getURI(ref1)
1343 butler.datastore.bridge.moveToTrash([ref1]) # Update the dataset_location table
1344 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
1345 butler.datastore.trash(ref1)
1346 butler.datastore.emptyTrash()
1347 self.assertTrue(uri1.exists())
1348 uri1.remove() # Clean it up.
1350 # Simulate execution butler setup by deleting the datastore
1351 # record but keeping the file around and trusting.
1352 butler.datastore.trustGetRequest = True
1353 uri2 = butler.datastore.getURI(ref2)
1354 uri3 = butler.datastore.getURI(ref3)
1355 self.assertTrue(uri2.exists())
1356 self.assertTrue(uri3.exists())
1358 # Remove the datastore record.
1359 butler.datastore.bridge.moveToTrash([ref2]) # Update the dataset_location table
1360 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
1361 self.assertTrue(uri2.exists())
1362 butler.datastore.trash([ref2, ref3])
1363 # Immediate removal for ref2 file
1364 self.assertFalse(uri2.exists())
1365 # But ref3 has to wait for the trash to be emptied.
1366 self.assertTrue(uri3.exists())
1367 butler.datastore.emptyTrash()
1368 self.assertFalse(uri3.exists())
1370 # Clear out the datasets from registry.
1371 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1373 def testPytypePutCoercion(self):
1374 """Test python type coercion on Butler.get and put."""
1376 # Store some data with the normal example storage class.
1377 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1378 datasetTypeName = "test_metric"
1379 butler, _ = self.create_butler("ingest", storageClass, datasetTypeName)
1381 dataId = {"instrument": "DummyCamComp", "visit": 423}
1383 # Put a dict and this should coerce to a MetricsExample
1384 test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
1385 metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
1386 test_metric = butler.getDirect(metric_ref)
1387 self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
1388 self.assertEqual(test_metric.summary, test_dict["summary"])
1389 self.assertEqual(test_metric.output, test_dict["output"])
1391 # Check that the put still works if a DatasetType is given with
1392 # a definition matching this python type.
1393 registry_type = butler.registry.getDatasetType(datasetTypeName)
1394 this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
1395 metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
1396 self.assertEqual(metric2_ref.datasetType, registry_type)
1398 # The get will return the type expected by registry.
1399 test_metric2 = butler.getDirect(metric2_ref)
1400 self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")
1402 # Make a new DatasetRef with the compatible but different DatasetType.
1403 # This should now return a dict.
1404 new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
1405 test_dict2 = butler.getDirect(new_ref)
1406 self.assertEqual(get_full_type_name(test_dict2), "dict")
1408 # Get it again with the wrong dataset type definition using get()
1409 # rather than getDirect(). This should be consistent with getDirect()
1410 # behavior and return the type of the DatasetType.
1411 test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
1412 self.assertEqual(get_full_type_name(test_dict3), "dict")
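# In short: both get() and getDirect() coerce the returned python object
# to match the storage class of the DatasetType actually used for the
# read, not necessarily the one registered in the registry.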
1414 def testPytypeCoercion(self):
1415 """Test python type coercion on Butler.get and put."""
1417 # Store some data with the normal example storage class.
1418 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1419 datasetTypeName = "test_metric"
1420 butler = self.runPutGetTest(storageClass, datasetTypeName)
1422 dataId = {"instrument": "DummyCamComp", "visit": 423}
1423 metric = butler.get(datasetTypeName, dataId=dataId)
1424 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1426 datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
1427 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1429 # Now need to hack the registry dataset type definition.
1430 # There is no API for this.
1431 manager = butler.registry._managers.datasets
1432 manager._db.update(
1433 manager._static.dataset_type,
1434 {"name": datasetTypeName},
1435 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1436 )
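# Note: Database.update interprets the ``where`` dict as a mapping from
# column name to the key in the row dict that supplies the value, which
# is why the row above uses the dataset type name itself as a key (this
# reading of the low-level API is an assumption, not documented here).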
1438 # Force reset of dataset type cache
1439 butler.registry.refresh()
1441 datasetType_new = butler.registry.getDatasetType(datasetTypeName)
1442 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1443 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1445 metric_model = butler.get(datasetTypeName, dataId=dataId)
1446 self.assertNotEqual(type(metric_model), type(metric))
1447 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1449 # Put the model and read it back to show that everything now
1450 # works as normal.
1451 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1452 metric_model_new = butler.get(metric_ref)
1453 self.assertEqual(metric_model_new, metric_model)
1455 # Hack the storage class again to something that will fail on the
1456 # get with no conversion class.
1457 manager._db.update(
1458 manager._static.dataset_type,
1459 {"name": datasetTypeName},
1460 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1461 )
1462 butler.registry.refresh()
1464 with self.assertRaises(ValueError):
1465 butler.get(datasetTypeName, dataId=dataId)
1468class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1469 """InMemoryDatastore specialization of a butler"""
1471 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1472 fullConfigKey = None
1473 useTempRoot = False
1474 validationCanFail = False
1475 datastoreStr = ["datastore='InMemory"]
1476 datastoreName = ["InMemoryDatastore@"]
1477 registryStr = "/gen3.sqlite3"
1479 def testIngest(self):
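# The in-memory datastore has no file artifacts to ingest, so the
# inherited ingest test is deliberately disabled (presumed rationale).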
1480 pass
1483class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1484 """PosixDatastore specialization"""
1486 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
1487 fullConfigKey = ".datastore.datastores.1.formatters"
1488 validationCanFail = True
1489 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
1490 datastoreName = [
1491 "InMemoryDatastore@",
1492 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
1493 "SecondDatastore",
1494 ]
1495 registryStr = "/gen3.sqlite3"
1498class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
1499 """Test that a yaml file in one location can refer to a root in another."""
1501 datastoreStr = ["dir1"]
1502 # Disable the makeRepo test since we are deliberately not using
1503 # butler.yaml as the config name.
1504 fullConfigKey = None
1506 def setUp(self):
1507 self.root = makeTestTempDir(TESTDIR)
1509 # Make a new repository in one place
1510 self.dir1 = os.path.join(self.root, "dir1")
1511 Butler.makeRepo(self.dir1, config=Config(self.configFile))
1513 # Move the yaml file to a different place and add a "root"
1514 self.dir2 = os.path.join(self.root, "dir2")
1515 os.makedirs(self.dir2, exist_ok=True)
1516 configFile1 = os.path.join(self.dir1, "butler.yaml")
1517 config = Config(configFile1)
1518 config["root"] = self.dir1
1519 configFile2 = os.path.join(self.dir2, "butler2.yaml")
1520 config.dumpToUri(configFile2)
1521 os.remove(configFile1)
1522 self.tmpConfigFile = configFile2
1524 def testFileLocations(self):
1525 self.assertNotEqual(self.dir1, self.dir2)
1526 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
1527 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
1528 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
1531class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
1532 """Test that a config file created by makeRepo outside of repo works."""
1534 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1536 def setUp(self):
1537 self.root = makeTestTempDir(TESTDIR)
1538 self.root2 = makeTestTempDir(TESTDIR)
1540 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
1541 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1543 def tearDown(self):
1544 if os.path.exists(self.root2):
1545 shutil.rmtree(self.root2, ignore_errors=True)
1546 super().tearDown()
1548 def testConfigExistence(self):
1549 c = Config(self.tmpConfigFile)
1550 uri_config = ResourcePath(c["root"])
1551 uri_expected = ResourcePath(self.root, forceDirectory=True)
1552 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
1553 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
1555 def testPutGet(self):
1556 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1557 self.runPutGetTest(storageClass, "test_metric")
1560class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
1561 """Test that a config file created by makeRepo outside of repo works."""
1563 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1565 def setUp(self):
1566 self.root = makeTestTempDir(TESTDIR)
1567 self.root2 = makeTestTempDir(TESTDIR)
1569 self.tmpConfigFile = self.root2
1570 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1572 def testConfigExistence(self):
1573 # Append the yaml file name, otherwise the Config constructor cannot
1574 # determine the file type.
1575 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
1576 super().testConfigExistence()
1579class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
1580 """Test that a config file created by makeRepo outside of repo works."""
1582 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1584 def setUp(self):
1585 self.root = makeTestTempDir(TESTDIR)
1586 self.root2 = makeTestTempDir(TESTDIR)
1588 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
1589 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1592@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
1593class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1594 """S3Datastore specialization of a butler; an S3 storage Datastore +
1595 a local in-memory SqlRegistry.
1596 """
1598 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
1599 fullConfigKey = None
1600 validationCanFail = True
1602 bucketName = "anybucketname"
1603 """Name of the Bucket that will be used in the tests. The name is read from
1604 the config file used with the tests during set-up.
1605 """
1607 root = "butlerRoot/"
1608 """Root repository directory expected to be used in case useTempRoot=False.
1609 Otherwise the root is set to a 20 characters long randomly generated string
1610 during set-up.
1611 """
1613 datastoreStr = [f"datastore={root}"]
1614 """Contains all expected root locations in a format expected to be
1615 returned by Butler stringification.
1616 """
1618 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
1619 """The expected format of the S3 Datastore string."""
1621 registryStr = "/gen3.sqlite3"
1622 """Expected format of the Registry string."""
1624 mock_s3 = mock_s3()
1625 """The mocked s3 interface from moto."""
1627 def genRoot(self):
1628 """Returns a random string of len 20 to serve as a root
1629 name for the temporary bucket repo.
1631 This is equivalent to tempfile.mkdtemp as this is what self.root
1632 becomes when useTempRoot is True.
1633 """
1634 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1635 return rndstr + "/"
1637 def setUp(self):
1638 config = Config(self.configFile)
1639 uri = ResourcePath(config[".datastore.datastore.root"])
1640 self.bucketName = uri.netloc
1642 # Enable S3 mocking of tests.
1643 self.mock_s3.start()
1645 # set up some fake credentials if they do not exist
1646 self.usingDummyCredentials = setAwsEnvCredentials()
1648 if self.useTempRoot:
1649 self.root = self.genRoot()
1650 rooturi = f"s3://{self.bucketName}/{self.root}"
1651 config.update({"datastore": {"datastore": {"root": rooturi}}})
1653 # Need a local folder to store the registry database.
1654 self.reg_dir = makeTestTempDir(TESTDIR)
1655 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1657 # Moto needs to know that we expect the bucket to exist
1658 # (the name used to be the class attribute bucketName).
1659 s3 = boto3.resource("s3")
1660 s3.create_bucket(Bucket=self.bucketName)
1662 self.datastoreStr = f"datastore={self.root}"
1663 self.datastoreName = [f"FileDatastore@{rooturi}"]
1664 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
1665 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
1667 def tearDown(self):
1668 s3 = boto3.resource("s3")
1669 bucket = s3.Bucket(self.bucketName)
1670 try:
1671 bucket.objects.all().delete()
1672 except botocore.exceptions.ClientError as e:
1673 if e.response["Error"]["Code"] == "404":
1674 # The key was not reachable; nothing to clean up.
1675 pass
1676 else:
1677 raise
1679 bucket = s3.Bucket(self.bucketName)
1680 bucket.delete()
1682 # Stop the S3 mock.
1683 self.mock_s3.stop()
1685 # unset any potentially set dummy credentials
1686 if self.usingDummyCredentials:
1687 unsetAwsEnvCredentials()
1689 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1690 shutil.rmtree(self.reg_dir, ignore_errors=True)
1692 if self.useTempRoot and os.path.exists(self.root):
1693 shutil.rmtree(self.root, ignore_errors=True)
1696@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!")
1697# Mock required environment variables during tests
1698@unittest.mock.patch.dict(
1699 os.environ,
1700 {
1701 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
1702 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
1703 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
1704 },
1705)
1706class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1707 """WebdavDatastore specialization of a butler; a Webdav storage Datastore +
1708 a local in-memory SqlRegistry.
1709 """
1711 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml")
1712 fullConfigKey = None
1713 validationCanFail = True
1715 serverName = "localhost"
1716 """Name of the server that will be used in the tests.
1717 """
1719 portNumber = 8080
1720 """Port on which the webdav server listens. Automatically chosen
1721 at setUpClass via the _getfreeport() method
1722 """
1724 root = "butlerRoot/"
1725 """Root repository directory expected to be used in case useTempRoot=False.
1726 Otherwise the root is set to a 20 characters long randomly generated string
1727 during set-up.
1728 """
1730 datastoreStr = [f"datastore={root}"]
1731 """Contains all expected root locations in a format expected to be
1732 returned by Butler stringification.
1733 """
1735 datastoreName = ["FileDatastore@https://{serverName}/{root}"]
1736 """The expected format of the WebdavDatastore string."""
1738 registryStr = "/gen3.sqlite3"
1739 """Expected format of the Registry string."""
1741 serverThread = None
1742 """Thread in which the local webdav server will run"""
1744 stopWebdavServer = False
1745 """This flag will cause the webdav server to
1746 gracefully shut down when True
1747 """
1749 def genRoot(self):
1750 """Returns a random string of len 20 to serve as a root
1751 name for the temporary bucket repo.
1753 This is equivalent to tempfile.mkdtemp as this is what self.root
1754 becomes when useTempRoot is True.
1755 """
1756 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1757 return rndstr + "/"
1759 @classmethod
1760 def setUpClass(cls):
1761 # Do the same as inherited class
1762 cls.storageClassFactory = StorageClassFactory()
1763 cls.storageClassFactory.addFromConfig(cls.configFile)
1765 cls.portNumber = cls._getfreeport()
1766 # Start a local webdav server against which the tests will run.
1767 cls.serverThread = Thread(
1768 target=cls._serveWebdav, args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), daemon=True
1769 )
1770 cls.serverThread.start()
1771 # Wait for it to start
1772 time.sleep(3)
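# A fixed sleep is only a heuristic; setUp additionally verifies the
# endpoint with isWebdavEndpoint before creating the repository.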
1774 @classmethod
1775 def tearDownClass(cls):
1776 # Ask for graceful shut down of the webdav server
1777 cls.stopWebdavServer = True
1778 # Wait for the thread to exit
1779 cls.serverThread.join()
1781 # Mock required environment variables during tests
1782 @unittest.mock.patch.dict(
1783 os.environ,
1784 {
1785 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
1786 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
1787 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
1788 },
1789 )
1790 def setUp(self):
1791 config = Config(self.configFile)
1793 if self.useTempRoot:
1794 self.root = self.genRoot()
1795 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}"
1796 config.update({"datastore": {"datastore": {"root": self.rooturi}}})
1799 # Need a local folder to store the registry database.
1799 self.reg_dir = makeTestTempDir(TESTDIR)
1800 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1802 self.datastoreStr = f"datastore={self.root}"
1803 self.datastoreName = [f"FileDatastore@{self.rooturi}"]
1805 if not isWebdavEndpoint(self.rooturi):
1806 raise OSError("Webdav server not running properly: cannot run tests.")
1808 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False)
1809 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml")
1811 # Mock required environment variables during tests
1812 @unittest.mock.patch.dict(
1813 os.environ,
1814 {
1815 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
1816 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
1817 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
1818 },
1819 )
1820 def tearDown(self):
1821 # Clear temporary directory
1822 ResourcePath(self.rooturi).remove()
1823 ResourcePath(self.rooturi).session.close()
1825 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1826 shutil.rmtree(self.reg_dir, ignore_errors=True)
1828 if self.useTempRoot and os.path.exists(self.root):
1829 shutil.rmtree(self.root, ignore_errors=True)
1831 def _serveWebdav(self, port: int, stopWebdavServer):
1832 """Starts a local webdav-compatible HTTP server,
1833 Listening on http://localhost:port
1834 This server only runs when this test class is instantiated,
1835 and then shuts down. Must be started is a separate thread.
1837 Parameters
1838 ----------
1839 port : `int`
1840 The port number on which the server should listen.
1841 """
1842 root_path = gettempdir()
1844 config = {
1845 "host": "0.0.0.0",
1846 "port": port,
1847 "provider_mapping": {"/": root_path},
1848 "http_authenticator": {"domain_controller": None},
1849 "simple_dc": {"user_mapping": {"*": True}},
1850 "verbose": 0,
1851 }
1852 app = WsgiDAVApp(config)
1854 server_args = {
1855 "bind_addr": (config["host"], config["port"]),
1856 "wsgi_app": app,
1857 }
1858 server = wsgi.Server(**server_args)
1859 server.prepare()
1861 try:
1862 # Start the actual server in a separate thread
1863 t = Thread(target=server.serve, daemon=True)
1864 t.start()
1865 # watch stopWebdavServer, and gracefully
1866 # shut down the server when True
1867 while True:
1868 if stopWebdavServer():
1869 break
1870 time.sleep(1)
1871 except KeyboardInterrupt:
1872 print("Caught Ctrl-C, shutting down...")
1873 finally:
1874 server.stop()
1875 t.join()
@staticmethod
1877 def _getfreeport():
1878 """Determine a free port by binding a socket to port 0 and
1879 letting the operating system choose one.
1880 """
1881 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
1882 free_socket.bind(("0.0.0.0", 0))
1883 free_socket.listen()
1884 port = free_socket.getsockname()[1]
1885 free_socket.close()
1886 return port
1889class PosixDatastoreTransfers(unittest.TestCase):
1890 """Test data transfers between butlers.
1892 Tests run with different dataset-ID managers. UUID to UUID and integer
1893 to integer are tested. UUID to integer is not supported since we do not
1894 currently want to allow that. Integer to UUID is supported, with the
1895 caveat that version-4 UUIDs will be generated, which would be incorrect
1896 for raw dataset types; the tests ignore that.
1897 """
1899 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1901 @classmethod
1902 def setUpClass(cls):
1903 cls.storageClassFactory = StorageClassFactory()
1904 cls.storageClassFactory.addFromConfig(cls.configFile)
1906 def setUp(self):
1907 self.root = makeTestTempDir(TESTDIR)
1908 self.config = Config(self.configFile)
1910 def tearDown(self):
1911 removeTestTempDir(self.root)
1913 def create_butler(self, manager, label):
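# Butler.makeRepo returns the new repository's Config, which is passed
# directly to the Butler constructor below.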
1914 config = Config(self.configFile)
1915 config["registry", "managers", "datasets"] = manager
1916 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
1918 def create_butlers(self, manager1, manager2):
1919 self.source_butler = self.create_butler(manager1, "1")
1920 self.target_butler = self.create_butler(manager2, "2")
1922 def testTransferUuidToUuid(self):
1923 self.create_butlers(
1924 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1925 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1926 )
1927 # Setting id_gen_map should have no effect here
1928 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1930 def testTransferIntToInt(self):
1931 self.create_butlers(
1932 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1933 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1934 )
1935 # int dataset ID only allows UNIQUE
1936 self.assertButlerTransfers()
1938 def testTransferIntToUuid(self):
1939 self.create_butlers(
1940 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1941 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1942 )
1943 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1945 def testTransferMissing(self):
1946 """Test transfers where datastore records are missing.
1948 This is how execution butler works.
1949 """
1950 self.create_butlers(
1951 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1952 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1953 )
1955 # Configure the source butler to allow trust.
1956 self.source_butler.datastore.trustGetRequest = True
1958 self.assertButlerTransfers(purge=True)
1960 def testTransferMissingDisassembly(self):
1961 """Test transfers where datastore records are missing.
1963 This is how execution butler works.
1964 """
1965 self.create_butlers(
1966 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1967 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1968 )
1970 # Configure the source butler to allow trust.
1971 self.source_butler.datastore.trustGetRequest = True
1973 # Test disassembly.
1974 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
1976 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"):
1977 """Test that a run can be transferred to another butler."""
1979 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
1980 datasetTypeName = "random_data"
1982 # The test will create 3 collections; two of the three will be
1983 # transferred.
1984 runs = ["run1", "run2", "other"]
1986 # Also want to use two different dataset types to ensure that
1987 # grouping works.
1988 datasetTypeNames = ["random_data", "random_data_2"]
1990 # Create the run collections in the source butler.
1991 for run in runs:
1992 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
1994 # Create dimensions in both butlers (transfer will not create them).
1995 n_exposures = 30
1996 for butler in (self.source_butler, self.target_butler):
1997 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1998 butler.registry.insertDimensionData(
1999 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
2000 )
2001 butler.registry.insertDimensionData(
2002 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
2003 )
2005 for i in range(n_exposures):
2006 butler.registry.insertDimensionData(
2007 "exposure",
2008 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
2009 )
2011 # Create dataset types in the source butler.
2012 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
2013 for datasetTypeName in datasetTypeNames:
2014 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
2015 self.source_butler.registry.registerDatasetType(datasetType)
2017 # Write a dataset to an unrelated run -- this will ensure that
2018 # we are rewriting integer dataset ids in the target if necessary.
2019 # Will not be relevant for UUID.
2020 run = "distraction"
2021 butler = Butler(butler=self.source_butler, run=run)
2022 butler.put(
2023 makeExampleMetrics(),
2024 datasetTypeName,
2025 exposure=1,
2026 instrument="DummyCamComp",
2027 physical_filter="d-r",
2028 )
2030 # Write some example metrics to the source
2031 butler = Butler(butler=self.source_butler)
2033 # Set of DatasetRefs that should be in the list of refs to transfer
2034 # but which will not be transferred.
2035 deleted = set()
2037 n_expected = 20 # Number of datasets expected to be transferred
2038 source_refs = []
2039 for i in range(n_exposures):
2040 # Put a third of the datasets into each collection; only two
2041 # thirds will be retained.
2042 index = i % 3
2043 run = runs[index]
2044 datasetTypeName = datasetTypeNames[i % 2]
2046 metric_data = {
2047 "summary": {"counter": i},
2048 "output": {"text": "metric"},
2049 "data": [2 * x for x in range(i)],
2050 }
2051 metric = MetricsExample(**metric_data)
2052 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
2053 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
2055 # Remove the datastore record using low-level API
2056 if purge:
2057 # Remove records for a fraction.
2058 if index == 1:
2060 # For one of these delete the file as well.
2061 # This allows the "missing" code to filter the
2062 # file out.
2063 if not deleted:
2064 primary, uris = butler.datastore.getURIs(ref)
2065 if primary:
2066 primary.remove()
2067 for uri in uris.values():
2068 uri.remove()
2069 n_expected -= 1
2070 deleted.add(ref)
2072 # Remove the datastore record.
2073 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id})
2075 if index < 2:
2076 source_refs.append(ref)
2077 if ref not in deleted:
2078 new_metric = butler.get(ref.unresolved(), collections=run)
2079 self.assertEqual(new_metric, metric)
2081 # Create some bad dataset types to ensure we check for inconsistent
2082 # definitions.
2083 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
2084 for datasetTypeName in datasetTypeNames:
2085 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
2086 self.target_butler.registry.registerDatasetType(datasetType)
2087 with self.assertRaises(ConflictingDefinitionError):
2088 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2089 # And remove the bad definitions.
2090 for datasetTypeName in datasetTypeNames:
2091 self.target_butler.registry.removeDatasetType(datasetTypeName)
2093 # Transfer without creating dataset types should fail.
2094 with self.assertRaises(KeyError):
2095 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2097 # Now transfer them to the second butler
2098 with self.assertLogs(level=logging.DEBUG) as cm:
2099 transferred = self.target_butler.transfer_from(
2100 self.source_butler, source_refs, id_gen_map=id_gen_map, register_dataset_types=True
2101 )
2102 self.assertEqual(len(transferred), n_expected)
2103 log_output = ";".join(cm.output)
2104 self.assertIn("found in datastore for chunk", log_output)
2105 self.assertIn("Creating output run", log_output)
2107 # Do the transfer twice to ensure that it will do nothing extra.
2108 # Only do this if purge=True because it does not work for int
2109 # dataset_id.
2110 if purge:
2111 # This should not need to register dataset types.
2112 transferred = self.target_butler.transfer_from(
2113 self.source_butler, source_refs, id_gen_map=id_gen_map
2114 )
2115 self.assertEqual(len(transferred), n_expected)
2117 # Also do an explicit low-level transfer to trigger some
2118 # edge cases.
2119 with self.assertLogs(level=logging.DEBUG) as cm:
2120 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
2121 log_output = ";".join(cm.output)
2122 self.assertIn("no file artifacts exist", log_output)
2124 with self.assertRaises(TypeError):
2125 self.target_butler.datastore.transfer_from(self.source_butler, source_refs)
2127 with self.assertRaises(ValueError):
2128 self.target_butler.datastore.transfer_from(
2129 self.source_butler.datastore, source_refs, transfer="split"
2130 )
2132 # Now try to get the same refs from the new butler.
2133 for ref in source_refs:
2134 if ref not in deleted:
2135 unresolved_ref = ref.unresolved()
2136 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run)
2137 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run)
2138 self.assertEqual(new_metric, old_metric)
2140 # Now prune run2 collection and create instead a CHAINED collection.
2141 # This should block the transfer.
2142 self.target_butler.pruneCollection("run2", purge=True, unstore=True)
2143 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2144 with self.assertRaises(TypeError):
2145 # Re-importing the run1 datasets can be problematic if they
2146 # use integer IDs so filter those out.
2147 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2148 self.target_butler.transfer_from(self.source_butler, to_transfer, id_gen_map=id_gen_map)
2151if __name__ == "__main__":
2152 unittest.main()