Coverage for tests/test_butler.py: 16%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Tests for Butler.
23"""
25import logging
26import os
27import pathlib
28import pickle
29import posixpath
30import random
31import shutil
32import socket
33import string
34import tempfile
35import time
36import unittest
38try:
39 import boto3
40 import botocore
41 from moto import mock_s3
42except ImportError:
43 boto3 = None
45 def mock_s3(cls):
46 """A no-op decorator in case moto mock_s3 can not be imported."""
47 return cls
50try:
51 from cheroot import wsgi
52 from wsgidav.wsgidav_app import WsgiDAVApp
53except ImportError:
54 WsgiDAVApp = None
56from tempfile import gettempdir
57from threading import Thread
59import astropy.time
60from lsst.daf.butler import (
61 Butler,
62 ButlerConfig,
63 CollectionSearch,
64 CollectionType,
65 Config,
66 DatasetIdGenEnum,
67 DatasetRef,
68 DatasetType,
69 FileDataset,
70 FileTemplateValidationError,
71 StorageClassFactory,
72 ValidationError,
73 script,
74)
75from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
76from lsst.daf.butler.registry import ConflictingDefinitionError, MissingCollectionError
77from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
78from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
79from lsst.resources import ResourcePath
80from lsst.resources.http import isWebdavEndpoint
81from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
82from lsst.utils import doImport
83from lsst.utils.introspection import get_full_type_name
85TESTDIR = os.path.abspath(os.path.dirname(__file__))
88def makeExampleMetrics():
89 return MetricsExample(
90 {"AM1": 5.2, "AM2": 30.6},
91 {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
92 [563, 234, 456.7, 752, 8, 9, 27],
93 )
96class TransactionTestError(Exception):
97 """Specific error for testing transactions, to prevent misdiagnosing
98 that might otherwise occur when a standard exception is used.
99 """
101 pass
104class ButlerConfigTests(unittest.TestCase):
105 """Simple tests for ButlerConfig that are not tested in any other test
106 cases."""
108 def testSearchPath(self):
109 configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
110 with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
111 config1 = ButlerConfig(configFile)
112 self.assertNotIn("testConfigs", "\n".join(cm.output))
114 overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
115 with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
116 config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
117 self.assertIn("testConfigs", "\n".join(cm.output))
119 key = ("datastore", "records", "table")
120 self.assertNotEqual(config1[key], config2[key])
121 self.assertEqual(config2[key], "override_record")
124class ButlerPutGetTests:
125 """Helper method for running a suite of put/get tests from different
126 butler configurations."""
128 root = None
130 @staticmethod
131 def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
132 """Create a DatasetType and register it"""
133 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
134 registry.registerDatasetType(datasetType)
135 return datasetType
137 @classmethod
138 def setUpClass(cls):
139 cls.storageClassFactory = StorageClassFactory()
140 cls.storageClassFactory.addFromConfig(cls.configFile)
142 def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
143 datasetType = datasetRef.datasetType
144 dataId = datasetRef.dataId
145 deferred = butler.getDirectDeferred(datasetRef)
147 for component in components:
148 compTypeName = datasetType.componentTypeName(component)
149 result = butler.get(compTypeName, dataId, collections=collections)
150 self.assertEqual(result, getattr(reference, component))
151 result_deferred = deferred.get(component=component)
152 self.assertEqual(result_deferred, result)
154 def tearDown(self):
155 removeTestTempDir(self.root)
157 def runPutGetTest(self, storageClass, datasetTypeName):
158 # New datasets will be added to run and tag, but we will only look in
159 # tag when looking up datasets.
160 run = "ingest"
161 butler = Butler(self.tmpConfigFile, run=run)
163 collections = set(butler.registry.queryCollections())
164 self.assertEqual(collections, set([run]))
166 # Create and register a DatasetType
167 dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
169 datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
171 # Add needed Dimensions
172 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
173 butler.registry.insertDimensionData(
174 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
175 )
176 butler.registry.insertDimensionData(
177 "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
178 )
179 visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
180 visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
181 butler.registry.insertDimensionData(
182 "visit",
183 {
184 "instrument": "DummyCamComp",
185 "id": 423,
186 "name": "fourtwentythree",
187 "physical_filter": "d-r",
188 "visit_system": 1,
189 "datetime_begin": visit_start,
190 "datetime_end": visit_end,
191 },
192 )
194 # Add a second visit for some later tests
195 butler.registry.insertDimensionData(
196 "visit",
197 {
198 "instrument": "DummyCamComp",
199 "id": 424,
200 "name": "fourtwentyfour",
201 "physical_filter": "d-r",
202 "visit_system": 1,
203 },
204 )
206 # Create and store a dataset
207 metric = makeExampleMetrics()
208 dataId = {"instrument": "DummyCamComp", "visit": 423}
210 # Create a DatasetRef for put
211 refIn = DatasetRef(datasetType, dataId, id=None)
213 # Put with a preexisting id should fail
214 with self.assertRaises(ValueError):
215 butler.put(metric, DatasetRef(datasetType, dataId, id=100))
217 # Put and remove the dataset once as a DatasetRef, once as a dataId,
218 # and once with a DatasetType
220 # Keep track of any collections we add and do not clean up
221 expected_collections = {run}
223 counter = 0
224 for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
225 # Since we are using subTest we can get cascading failures
226 # here with the first attempt failing and the others failing
227 # immediately because the dataset already exists. Work around
228 # this by using a distinct run collection each time
229 counter += 1
230 this_run = f"put_run_{counter}"
231 butler.registry.registerCollection(this_run, type=CollectionType.RUN)
232 expected_collections.update({this_run})
234 with self.subTest(args=args):
235 ref = butler.put(metric, *args, run=this_run)
236 self.assertIsInstance(ref, DatasetRef)
238 # Test getDirect
239 metricOut = butler.getDirect(ref)
240 self.assertEqual(metric, metricOut)
241 # Test get
242 metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
243 self.assertEqual(metric, metricOut)
244 # Test get with a datasetRef
245 metricOut = butler.get(ref, collections=this_run)
246 self.assertEqual(metric, metricOut)
247 # Test getDeferred with dataId
248 metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
249 self.assertEqual(metric, metricOut)
250 # Test getDeferred with a datasetRef
251 metricOut = butler.getDeferred(ref, collections=this_run).get()
252 self.assertEqual(metric, metricOut)
253 # and deferred direct with ref
254 metricOut = butler.getDirectDeferred(ref).get()
255 self.assertEqual(metric, metricOut)
257 # Check we can get components
258 if storageClass.isComposite():
259 self.assertGetComponents(
260 butler, ref, ("summary", "data", "output"), metric, collections=this_run
261 )
263 # Can the artifacts themselves be retrieved?
264 if not butler.datastore.isEphemeral:
265 root_uri = ResourcePath(self.root)
267 for preserve_path in (True, False):
268 destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
269 # Use copy so that we can test that overwrite
270 # protection works (using "auto" for File URIs would
271 # use hard links and subsequent transfer would work
272 # because it knows they are the same file).
273 transferred = butler.retrieveArtifacts(
274 [ref], destination, preserve_path=preserve_path, transfer="copy"
275 )
276 self.assertGreater(len(transferred), 0)
277 artifacts = list(ResourcePath.findFileResources([destination]))
278 self.assertEqual(set(transferred), set(artifacts))
280 for artifact in transferred:
281 path_in_destination = artifact.relative_to(destination)
282 self.assertIsNotNone(path_in_destination)
284 # when path is not preserved there should not be
285 # any path separators.
286 num_seps = path_in_destination.count("/")
287 if preserve_path:
288 self.assertGreater(num_seps, 0)
289 else:
290 self.assertEqual(num_seps, 0)
292 primary_uri, secondary_uris = butler.datastore.getURIs(ref)
293 n_uris = len(secondary_uris)
294 if primary_uri:
295 n_uris += 1
296 self.assertEqual(
297 len(artifacts),
298 n_uris,
299 "Comparing expected artifacts vs actual:"
300 f" {artifacts} vs {primary_uri} and {secondary_uris}",
301 )
303 if preserve_path:
304 # No need to run these twice
305 with self.assertRaises(ValueError):
306 butler.retrieveArtifacts([ref], destination, transfer="move")
308 with self.assertRaises(FileExistsError):
309 butler.retrieveArtifacts([ref], destination)
311 transferred_again = butler.retrieveArtifacts(
312 [ref], destination, preserve_path=preserve_path, overwrite=True
313 )
314 self.assertEqual(set(transferred_again), set(transferred))
316 # Now remove the dataset completely.
317 butler.pruneDatasets([ref], purge=True, unstore=True, run=this_run)
318 # Lookup with original args should still fail.
319 with self.assertRaises(LookupError):
320 butler.datasetExists(*args, collections=this_run)
321 # getDirect() should still fail.
322 with self.assertRaises(FileNotFoundError):
323 butler.getDirect(ref)
324 # Registry shouldn't be able to find it by dataset_id anymore.
325 self.assertIsNone(butler.registry.getDataset(ref.id))
327 # Do explicit registry removal since we know they are
328 # empty
329 butler.registry.removeCollection(this_run)
330 expected_collections.remove(this_run)
332 # Put the dataset again, since the last thing we did was remove it
333 # and we want to use the default collection.
334 ref = butler.put(metric, refIn)
336 # Get with parameters
337 stop = 4
338 sliced = butler.get(ref, parameters={"slice": slice(stop)})
339 self.assertNotEqual(metric, sliced)
340 self.assertEqual(metric.summary, sliced.summary)
341 self.assertEqual(metric.output, sliced.output)
342 self.assertEqual(metric.data[:stop], sliced.data)
343 # getDeferred with parameters
344 sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
345 self.assertNotEqual(metric, sliced)
346 self.assertEqual(metric.summary, sliced.summary)
347 self.assertEqual(metric.output, sliced.output)
348 self.assertEqual(metric.data[:stop], sliced.data)
349 # getDeferred with deferred parameters
350 sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
351 self.assertNotEqual(metric, sliced)
352 self.assertEqual(metric.summary, sliced.summary)
353 self.assertEqual(metric.output, sliced.output)
354 self.assertEqual(metric.data[:stop], sliced.data)
356 if storageClass.isComposite():
357 # Check that components can be retrieved
358 metricOut = butler.get(ref.datasetType.name, dataId)
359 compNameS = ref.datasetType.componentTypeName("summary")
360 compNameD = ref.datasetType.componentTypeName("data")
361 summary = butler.get(compNameS, dataId)
362 self.assertEqual(summary, metric.summary)
363 data = butler.get(compNameD, dataId)
364 self.assertEqual(data, metric.data)
366 if "counter" in storageClass.derivedComponents:
367 count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
368 self.assertEqual(count, len(data))
370 count = butler.get(
371 ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
372 )
373 self.assertEqual(count, stop)
375 compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
376 summary = butler.getDirect(compRef)
377 self.assertEqual(summary, metric.summary)
379 # Create a Dataset type that has the same name but is inconsistent.
380 inconsistentDatasetType = DatasetType(
381 datasetTypeName, dimensions, self.storageClassFactory.getStorageClass("Config")
382 )
384 # Getting with a dataset type that does not match registry fails
385 with self.assertRaises(ValueError):
386 butler.get(inconsistentDatasetType, dataId)
388 # Combining a DatasetRef with a dataId should fail
389 with self.assertRaises(ValueError):
390 butler.get(ref, dataId)
391 # Getting with an explicit ref should fail if the id doesn't match
392 with self.assertRaises(ValueError):
393 butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))
395 # Getting a dataset with unknown parameters should fail
396 with self.assertRaises(KeyError):
397 butler.get(ref, parameters={"unsupported": True})
399 # Check we have a collection
400 collections = set(butler.registry.queryCollections())
401 self.assertEqual(collections, expected_collections)
403 # Clean up to check that we can remove something that may have
404 # already had a component removed
405 butler.pruneDatasets([ref], unstore=True, purge=True)
407 # Check that we can configure a butler to accept a put even
408 # if it already has the dataset in registry.
409 ref = butler.put(metric, refIn)
411 # Repeat put will fail.
412 with self.assertRaises(ConflictingDefinitionError):
413 butler.put(metric, refIn)
415 # Remove the datastore entry.
416 butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)
418 # Put will still fail
419 with self.assertRaises(ConflictingDefinitionError):
420 butler.put(metric, refIn)
422 # Allow the put to succeed
423 butler._allow_put_of_predefined_dataset = True
424 ref2 = butler.put(metric, refIn)
425 self.assertEqual(ref2.id, ref.id)
427 # A second put will still fail but with a different exception
428 # than before.
429 with self.assertRaises(ConflictingDefinitionError):
430 butler.put(metric, refIn)
432 # Reset the flag to avoid confusion
433 butler._allow_put_of_predefined_dataset = False
435 # Leave the dataset in place since some downstream tests require
436 # something to be present
438 return butler
440 def testDeferredCollectionPassing(self):
441 # Construct a butler with no run or collection, but make it writeable.
442 butler = Butler(self.tmpConfigFile, writeable=True)
443 # Create and register a DatasetType
444 dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
445 datasetType = self.addDatasetType(
446 "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
447 )
448 # Add needed Dimensions
449 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
450 butler.registry.insertDimensionData(
451 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
452 )
453 butler.registry.insertDimensionData(
454 "visit",
455 {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
456 )
457 dataId = {"instrument": "DummyCamComp", "visit": 423}
458 # Create dataset.
459 metric = makeExampleMetrics()
460 # Register a new run and put dataset.
461 run = "deferred"
462 self.assertTrue(butler.registry.registerRun(run))
463 # Second time it will be allowed but indicate no-op
464 self.assertFalse(butler.registry.registerRun(run))
465 ref = butler.put(metric, datasetType, dataId, run=run)
466 # Putting with no run should fail with TypeError.
467 with self.assertRaises(TypeError):
468 butler.put(metric, datasetType, dataId)
469 # Dataset should exist.
470 self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
471 # We should be able to get the dataset back, but with and without
472 # a deferred dataset handle.
473 self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
474 self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
475 # Trying to find the dataset without any collection is a TypeError.
476 with self.assertRaises(TypeError):
477 butler.datasetExists(datasetType, dataId)
478 with self.assertRaises(TypeError):
479 butler.get(datasetType, dataId)
480 # Associate the dataset with a different collection.
481 butler.registry.registerCollection("tagged")
482 butler.registry.associate("tagged", [ref])
483 # Deleting the dataset from the new collection should make it findable
484 # in the original collection.
485 butler.pruneDatasets([ref], tags=["tagged"])
486 self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
489class ButlerTests(ButlerPutGetTests):
490 """Tests for Butler."""
492 useTempRoot = True
494 def setUp(self):
495 """Create a new butler root for each test."""
496 self.root = makeTestTempDir(TESTDIR)
497 Butler.makeRepo(self.root, config=Config(self.configFile))
498 self.tmpConfigFile = os.path.join(self.root, "butler.yaml")
500 def testConstructor(self):
501 """Independent test of constructor."""
502 butler = Butler(self.tmpConfigFile, run="ingest")
503 self.assertIsInstance(butler, Butler)
505 # Check that butler.yaml is added automatically.
506 if self.tmpConfigFile.endswith(end := "/butler.yaml"):
507 config_dir = self.tmpConfigFile[: -len(end)]
508 butler = Butler(config_dir, run="ingest")
509 self.assertIsInstance(butler, Butler)
511 collections = set(butler.registry.queryCollections())
512 self.assertEqual(collections, {"ingest"})
514 # Check that some special characters can be included in run name.
515 special_run = "u@b.c-A"
516 butler_special = Butler(butler=butler, run=special_run)
517 collections = set(butler_special.registry.queryCollections("*@*"))
518 self.assertEqual(collections, {special_run})
520 butler2 = Butler(butler=butler, collections=["other"])
521 self.assertEqual(butler2.collections, CollectionSearch.fromExpression(["other"]))
522 self.assertIsNone(butler2.run)
523 self.assertIs(butler.datastore, butler2.datastore)
525 # Test that we can use an environment variable to find this
526 # repository.
527 butler_index = Config()
528 butler_index["label"] = self.tmpConfigFile
529 for suffix in (".yaml", ".json"):
530 # Ensure that the content differs so that we know that
531 # we aren't reusing the cache.
532 bad_label = f"s3://bucket/not_real{suffix}"
533 butler_index["bad_label"] = bad_label
534 with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
535 butler_index.dumpToUri(temp_file)
536 with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
537 self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
538 uri = Butler.get_repo_uri("bad_label")
539 self.assertEqual(uri, ResourcePath(bad_label))
540 uri = Butler.get_repo_uri("label")
541 butler = Butler(uri, writeable=False)
542 self.assertIsInstance(butler, Butler)
543 with self.assertRaises(KeyError) as cm:
544 Butler.get_repo_uri("missing")
545 self.assertIn("not known to", str(cm.exception))
546 with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
547 with self.assertRaises(FileNotFoundError):
548 Butler.get_repo_uri("label")
549 self.assertEqual(Butler.get_known_repos(), set())
550 with self.assertRaises(KeyError) as cm:
551 # No environment variable set.
552 Butler.get_repo_uri("label")
553 self.assertIn("No repository index defined", str(cm.exception))
554 self.assertEqual(Butler.get_known_repos(), set())
556 def testBasicPutGet(self):
557 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
558 self.runPutGetTest(storageClass, "test_metric")
560 def testCompositePutGetConcrete(self):
562 storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
563 butler = self.runPutGetTest(storageClass, "test_metric")
565 # Should *not* be disassembled
566 datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
567 self.assertEqual(len(datasets), 1)
568 uri, components = butler.getURIs(datasets[0])
569 self.assertIsInstance(uri, ResourcePath)
570 self.assertFalse(components)
571 self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
572 self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
574 # Predicted dataset
575 dataId = {"instrument": "DummyCamComp", "visit": 424}
576 uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
577 self.assertFalse(components)
578 self.assertIsInstance(uri, ResourcePath)
579 self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
580 self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
582 def testCompositePutGetVirtual(self):
583 storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
584 butler = self.runPutGetTest(storageClass, "test_metric_comp")
586 # Should be disassembled
587 datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
588 self.assertEqual(len(datasets), 1)
589 uri, components = butler.getURIs(datasets[0])
591 if butler.datastore.isEphemeral:
592 # Never disassemble in-memory datastore
593 self.assertIsInstance(uri, ResourcePath)
594 self.assertFalse(components)
595 self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
596 self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
597 else:
598 self.assertIsNone(uri)
599 self.assertEqual(set(components), set(storageClass.components))
600 for compuri in components.values():
601 self.assertIsInstance(compuri, ResourcePath)
602 self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
603 self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")
605 # Predicted dataset
606 dataId = {"instrument": "DummyCamComp", "visit": 424}
607 uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
609 if butler.datastore.isEphemeral:
610 # Never disassembled
611 self.assertIsInstance(uri, ResourcePath)
612 self.assertFalse(components)
613 self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
614 self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
615 else:
616 self.assertIsNone(uri)
617 self.assertEqual(set(components), set(storageClass.components))
618 for compuri in components.values():
619 self.assertIsInstance(compuri, ResourcePath)
620 self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
621 self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")
623 def testIngest(self):
624 butler = Butler(self.tmpConfigFile, run="ingest")
626 # Create and register a DatasetType
627 dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])
629 storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
630 datasetTypeName = "metric"
632 datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
634 # Add needed Dimensions
635 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
636 butler.registry.insertDimensionData(
637 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
638 )
639 for detector in (1, 2):
640 butler.registry.insertDimensionData(
641 "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
642 )
644 butler.registry.insertDimensionData(
645 "visit",
646 {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
647 {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
648 )
650 formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
651 dataRoot = os.path.join(TESTDIR, "data", "basic")
652 datasets = []
653 for detector in (1, 2):
654 detector_name = f"detector_{detector}"
655 metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
656 dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
657 # Create a DatasetRef for ingest
658 refIn = DatasetRef(datasetType, dataId, id=None)
660 datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))
662 butler.ingest(*datasets, transfer="copy")
664 dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
665 dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}
667 metrics1 = butler.get(datasetTypeName, dataId1)
668 metrics2 = butler.get(datasetTypeName, dataId2)
669 self.assertNotEqual(metrics1, metrics2)
671 # Compare URIs
672 uri1 = butler.getURI(datasetTypeName, dataId1)
673 uri2 = butler.getURI(datasetTypeName, dataId2)
674 self.assertNotEqual(uri1, uri2)
676 # Now do a multi-dataset but single file ingest
677 metricFile = os.path.join(dataRoot, "detectors.yaml")
678 refs = []
679 for detector in (1, 2):
680 detector_name = f"detector_{detector}"
681 dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
682 # Create a DatasetRef for ingest
683 refs.append(DatasetRef(datasetType, dataId, id=None))
685 datasets = []
686 datasets.append(FileDataset(path=metricFile, refs=refs, formatter=MultiDetectorFormatter))
688 butler.ingest(*datasets, transfer="copy")
690 dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
691 dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}
693 multi1 = butler.get(datasetTypeName, dataId1)
694 multi2 = butler.get(datasetTypeName, dataId2)
696 self.assertEqual(multi1, metrics1)
697 self.assertEqual(multi2, metrics2)
699 # Compare URIs
700 uri1 = butler.getURI(datasetTypeName, dataId1)
701 uri2 = butler.getURI(datasetTypeName, dataId2)
702 self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")
704 # Test that removing one does not break the second
705 # This line will issue a warning log message for a ChainedDatastore
706 # that uses an InMemoryDatastore since in-memory can not ingest
707 # files.
708 butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
709 self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
710 self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
711 multi2b = butler.get(datasetTypeName, dataId2)
712 self.assertEqual(multi2, multi2b)
714 def testPruneCollections(self):
715 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
716 butler = Butler(self.tmpConfigFile, writeable=True)
717 # Load registry data with dimensions to hang datasets off of.
718 registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
719 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
720 # Add some RUN-type collections.
721 run1 = "run1"
722 butler.registry.registerRun(run1)
723 run2 = "run2"
724 butler.registry.registerRun(run2)
725 # put some datasets. ref1 and ref2 have the same data ID, and are in
726 # different runs. ref3 has a different data ID.
727 metric = makeExampleMetrics()
728 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
729 datasetType = self.addDatasetType(
730 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
731 )
732 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
733 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
734 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
736 # Try to delete a RUN collection without purge, or with purge and not
737 # unstore.
738 with self.assertRaises(TypeError):
739 butler.pruneCollection(run1)
740 with self.assertRaises(TypeError):
741 butler.pruneCollection(run2, purge=True)
742 # Add a TAGGED collection and associate ref3 only into it.
743 tag1 = "tag1"
744 registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
745 self.assertTrue(registered)
746 # Registering a second time should be allowed.
747 registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
748 self.assertFalse(registered)
749 butler.registry.associate(tag1, [ref3])
750 # Add a CHAINED collection that searches run1 and then run2. It
751 # logically contains only ref1, because ref2 is shadowed due to them
752 # having the same data ID and dataset type.
753 chain1 = "chain1"
754 butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
755 butler.registry.setCollectionChain(chain1, [run1, run2])
756 # Try to delete RUN collections, which should fail with complete
757 # rollback because they're still referenced by the CHAINED
758 # collection.
759 with self.assertRaises(Exception):
760 butler.pruneCollection(run1, pruge=True, unstore=True)
761 with self.assertRaises(Exception):
762 butler.pruneCollection(run2, pruge=True, unstore=True)
763 self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
764 existence = butler.datastore.mexists([ref1, ref2, ref3])
765 self.assertTrue(existence[ref1])
766 self.assertTrue(existence[ref2])
767 self.assertTrue(existence[ref3])
768 # Try to delete CHAINED and TAGGED collections with purge; should not
769 # work.
770 with self.assertRaises(TypeError):
771 butler.pruneCollection(tag1, purge=True, unstore=True)
772 with self.assertRaises(TypeError):
773 butler.pruneCollection(chain1, purge=True, unstore=True)
774 # Remove the tagged collection with unstore=False. This should not
775 # affect the datasets.
776 butler.pruneCollection(tag1)
777 with self.assertRaises(MissingCollectionError):
778 butler.registry.getCollectionType(tag1)
779 self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
780 existence = butler.datastore.mexists([ref1, ref2, ref3])
781 self.assertTrue(existence[ref1])
782 self.assertTrue(existence[ref2])
783 self.assertTrue(existence[ref3])
784 # Add the tagged collection back in, and remove it with unstore=True.
785 # This should remove ref3 only from the datastore.
786 butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
787 butler.registry.associate(tag1, [ref3])
788 butler.pruneCollection(tag1, unstore=True)
789 with self.assertRaises(MissingCollectionError):
790 butler.registry.getCollectionType(tag1)
791 self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
792 existence = butler.datastore.mexists([ref1, ref2, ref3])
793 self.assertTrue(existence[ref1])
794 self.assertTrue(existence[ref2])
795 self.assertFalse(existence[ref3])
796 # Delete the chain with unstore=False. The datasets should not be
797 # affected at all.
798 butler.pruneCollection(chain1)
799 with self.assertRaises(MissingCollectionError):
800 butler.registry.getCollectionType(chain1)
801 self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
802 existence = butler.datastore.mexists([ref1, ref2, ref3])
803 self.assertTrue(existence[ref1])
804 self.assertTrue(existence[ref2])
805 self.assertFalse(existence[ref3])
806 # Redefine and then delete the chain with unstore=True. Only ref1
807 # should be unstored (ref3 has already been unstored, but otherwise
808 # would be now).
809 butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
810 butler.registry.setCollectionChain(chain1, [run1, run2])
811 butler.pruneCollection(chain1, unstore=True)
812 with self.assertRaises(MissingCollectionError):
813 butler.registry.getCollectionType(chain1)
814 self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
815 existence = butler.datastore.mexists([ref1, ref2, ref3])
816 self.assertFalse(existence[ref1])
817 self.assertTrue(existence[ref2])
818 self.assertFalse(existence[ref3])
819 # Remove run1. This removes ref1 and ref3 from the registry (they're
820 # already gone from the datastore, which is fine).
821 butler.pruneCollection(run1, purge=True, unstore=True)
822 with self.assertRaises(MissingCollectionError):
823 butler.registry.getCollectionType(run1)
824 self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref2])
825 self.assertTrue(butler.datastore.exists(ref2))
826 # Remove run2. This removes ref2 from the registry and the datastore.
827 butler.pruneCollection(run2, purge=True, unstore=True)
828 with self.assertRaises(MissingCollectionError):
829 butler.registry.getCollectionType(run2)
830 self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [])
832 # Now that the collections have been pruned we can remove the
833 # dataset type
834 butler.registry.removeDatasetType(datasetType.name)
836 def testPickle(self):
837 """Test pickle support."""
838 butler = Butler(self.tmpConfigFile, run="ingest")
839 butlerOut = pickle.loads(pickle.dumps(butler))
840 self.assertIsInstance(butlerOut, Butler)
841 self.assertEqual(butlerOut._config, butler._config)
842 self.assertEqual(butlerOut.collections, butler.collections)
843 self.assertEqual(butlerOut.run, butler.run)
845 def testGetDatasetTypes(self):
846 butler = Butler(self.tmpConfigFile, run="ingest")
847 dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
848 dimensionEntries = [
849 (
850 "instrument",
851 {"instrument": "DummyCam"},
852 {"instrument": "DummyHSC"},
853 {"instrument": "DummyCamComp"},
854 ),
855 ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
856 ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
857 ]
858 storageClass = self.storageClassFactory.getStorageClass("StructuredData")
859 # Add needed Dimensions
860 for args in dimensionEntries:
861 butler.registry.insertDimensionData(*args)
863 # When a DatasetType is added to the registry entries are not created
864 # for components but querying them can return the components.
865 datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
866 components = set()
867 for datasetTypeName in datasetTypeNames:
868 # Create and register a DatasetType
869 self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
871 for componentName in storageClass.components:
872 components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))
874 fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
875 self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)
877 # Now that we have some dataset types registered, validate them
878 butler.validateConfiguration(
879 ignore=[
880 "test_metric_comp",
881 "metric3",
882 "calexp",
883 "DummySC",
884 "datasetType.component",
885 "random_data",
886 "random_data_2",
887 ]
888 )
890 # Add a new datasetType that will fail template validation
891 self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
892 if self.validationCanFail:
893 with self.assertRaises(ValidationError):
894 butler.validateConfiguration()
896 # Rerun validation but with a subset of dataset type names
897 butler.validateConfiguration(datasetTypeNames=["metric4"])
899 # Rerun validation but ignore the bad datasetType
900 butler.validateConfiguration(
901 ignore=[
902 "test_metric_comp",
903 "metric3",
904 "calexp",
905 "DummySC",
906 "datasetType.component",
907 "random_data",
908 "random_data_2",
909 ]
910 )
912 def testTransaction(self):
913 butler = Butler(self.tmpConfigFile, run="ingest")
914 datasetTypeName = "test_metric"
915 dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
916 dimensionEntries = (
917 ("instrument", {"instrument": "DummyCam"}),
918 ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
919 ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
920 )
921 storageClass = self.storageClassFactory.getStorageClass("StructuredData")
922 metric = makeExampleMetrics()
923 dataId = {"instrument": "DummyCam", "visit": 42}
924 # Create and register a DatasetType
925 datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
926 with self.assertRaises(TransactionTestError):
927 with butler.transaction():
928 # Add needed Dimensions
929 for args in dimensionEntries:
930 butler.registry.insertDimensionData(*args)
931 # Store a dataset
932 ref = butler.put(metric, datasetTypeName, dataId)
933 self.assertIsInstance(ref, DatasetRef)
934 # Test getDirect
935 metricOut = butler.getDirect(ref)
936 self.assertEqual(metric, metricOut)
937 # Test get
938 metricOut = butler.get(datasetTypeName, dataId)
939 self.assertEqual(metric, metricOut)
940 # Check we can get components
941 self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
942 raise TransactionTestError("This should roll back the entire transaction")
943 with self.assertRaises(LookupError, msg=f"Check can't expand DataId {dataId}"):
944 butler.registry.expandDataId(dataId)
945 # Should raise LookupError for missing data ID value
946 with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
947 butler.get(datasetTypeName, dataId)
948 # Also check explicitly if Dataset entry is missing
949 self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
950 # Direct retrieval should not find the file in the Datastore
951 with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
952 butler.getDirect(ref)
954 def testMakeRepo(self):
955 """Test that we can write butler configuration to a new repository via
956 the Butler.makeRepo interface and then instantiate a butler from the
957 repo root.
958 """
959 # Do not run the test if we know this datastore configuration does
960 # not support a file system root
961 if self.fullConfigKey is None:
962 return
964 # create two separate directories
965 root1 = tempfile.mkdtemp(dir=self.root)
966 root2 = tempfile.mkdtemp(dir=self.root)
968 butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
969 limited = Config(self.configFile)
970 butler1 = Butler(butlerConfig)
971 butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
972 full = Config(self.tmpConfigFile)
973 butler2 = Butler(butlerConfig)
974 # Butlers should have the same configuration regardless of whether
975 # defaults were expanded.
976 self.assertEqual(butler1._config, butler2._config)
977 # Config files loaded directly should not be the same.
978 self.assertNotEqual(limited, full)
979 # Make sure "limited" doesn't have a few keys we know it should be
980 # inheriting from defaults.
981 self.assertIn(self.fullConfigKey, full)
982 self.assertNotIn(self.fullConfigKey, limited)
984 # Collections don't appear until something is put in them
985 collections1 = set(butler1.registry.queryCollections())
986 self.assertEqual(collections1, set())
987 self.assertEqual(set(butler2.registry.queryCollections()), collections1)
989 # Check that a config with no associated file name will not
990 # work properly with relocatable Butler repo
991 butlerConfig.configFile = None
992 with self.assertRaises(ValueError):
993 Butler(butlerConfig)
995 with self.assertRaises(FileExistsError):
996 Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)
998 def testStringification(self):
999 butler = Butler(self.tmpConfigFile, run="ingest")
1000 butlerStr = str(butler)
1002 if self.datastoreStr is not None:
1003 for testStr in self.datastoreStr:
1004 self.assertIn(testStr, butlerStr)
1005 if self.registryStr is not None:
1006 self.assertIn(self.registryStr, butlerStr)
1008 datastoreName = butler.datastore.name
1009 if self.datastoreName is not None:
1010 for testStr in self.datastoreName:
1011 self.assertIn(testStr, datastoreName)
1013 def testButlerRewriteDataId(self):
1014 """Test that dataIds can be rewritten based on dimension records."""
1016 butler = Butler(self.tmpConfigFile, run="ingest")
1018 storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
1019 datasetTypeName = "random_data"
1021 # Create dimension records.
1022 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1023 butler.registry.insertDimensionData(
1024 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
1025 )
1026 butler.registry.insertDimensionData(
1027 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
1028 )
1030 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
1031 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
1032 butler.registry.registerDatasetType(datasetType)
1034 n_exposures = 5
1035 dayobs = 20210530
1037 for i in range(n_exposures):
1038 butler.registry.insertDimensionData(
1039 "exposure",
1040 {
1041 "instrument": "DummyCamComp",
1042 "id": i,
1043 "obs_id": f"exp{i}",
1044 "seq_num": i,
1045 "day_obs": dayobs,
1046 "physical_filter": "d-r",
1047 },
1048 )
1050 # Write some data.
1051 for i in range(n_exposures):
1052 metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}
1054 # Use the seq_num for the put to test rewriting.
1055 dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
1056 ref = butler.put(metric, datasetTypeName, dataId=dataId)
1058 # Check that the exposure is correct in the dataId
1059 self.assertEqual(ref.dataId["exposure"], i)
1061 # and check that we can get the dataset back with the same dataId
1062 new_metric = butler.get(datasetTypeName, dataId=dataId)
1063 self.assertEqual(new_metric, metric)
1066class FileDatastoreButlerTests(ButlerTests):
1067 """Common tests and specialization of ButlerTests for butlers backed
1068 by datastores that inherit from FileDatastore.
1069 """
1071 def checkFileExists(self, root, relpath):
1072 """Checks if file exists at a given path (relative to root).
1074 Test testPutTemplates verifies actual physical existance of the files
1075 in the requested location.
1076 """
1077 uri = ResourcePath(root, forceDirectory=True)
1078 return uri.join(relpath).exists()
1080 def testPutTemplates(self):
1081 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1082 butler = Butler(self.tmpConfigFile, run="ingest")
1084 # Add needed Dimensions
1085 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1086 butler.registry.insertDimensionData(
1087 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
1088 )
1089 butler.registry.insertDimensionData(
1090 "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
1091 )
1092 butler.registry.insertDimensionData(
1093 "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
1094 )
1096 # Create and store a dataset
1097 metric = makeExampleMetrics()
1099 # Create two almost-identical DatasetTypes (both will use default
1100 # template)
1101 dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
1102 butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
1103 butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
1104 butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))
1106 dataId1 = {"instrument": "DummyCamComp", "visit": 423}
1107 dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}
1109 # Put with exactly the data ID keys needed
1110 ref = butler.put(metric, "metric1", dataId1)
1111 uri = butler.getURI(ref)
1112 self.assertTrue(
1113 self.checkFileExists(butler.datastore.root, "ingest/metric1/??#?/d-r/DummyCamComp_423.pickle"),
1114 f"Checking existence of {uri}",
1115 )
1117 # Check the template based on dimensions
1118 butler.datastore.templates.validateTemplates([ref])
1120 # Put with extra data ID keys (physical_filter is an optional
1121 # dependency); should not change template (at least the way we're
1122 # defining them to behave now; the important thing is that they
1123 # must be consistent).
1124 ref = butler.put(metric, "metric2", dataId2)
1125 uri = butler.getURI(ref)
1126 self.assertTrue(
1127 self.checkFileExists(butler.datastore.root, "ingest/metric2/d-r/DummyCamComp_v423.pickle"),
1128 f"Checking existence of {uri}",
1129 )
1131 # Check the template based on dimensions
1132 butler.datastore.templates.validateTemplates([ref])
1134 # Now use a file template that will not result in unique filenames
1135 with self.assertRaises(FileTemplateValidationError):
1136 butler.put(metric, "metric3", dataId1)
1138 def testImportExport(self):
1139 # Run put/get tests just to create and populate a repo.
1140 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1141 self.runImportExportTest(storageClass)
1143 @unittest.expectedFailure
1144 def testImportExportVirtualComposite(self):
1145 # Run put/get tests just to create and populate a repo.
1146 storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
1147 self.runImportExportTest(storageClass)
1149 def runImportExportTest(self, storageClass):
1150 """This test does an export to a temp directory and an import back
1151 into a new temp directory repo. It does not assume a posix datastore"""
1152 exportButler = self.runPutGetTest(storageClass, "test_metric")
1153 print("Root:", exportButler.datastore.root)
1154 # Test that the repo actually has at least one dataset.
1155 datasets = list(exportButler.registry.queryDatasets(..., collections=...))
1156 self.assertGreater(len(datasets), 0)
1157 # Add a DimensionRecord that's unused by those datasets.
1158 skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
1159 exportButler.registry.insertDimensionData("skymap", skymapRecord)
1160 # Export and then import datasets.
1161 with safeTestTempDir(TESTDIR) as exportDir:
1162 exportFile = os.path.join(exportDir, "exports.yaml")
1163 with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
1164 export.saveDatasets(datasets)
1165 # Export the same datasets again. This should quietly do
1166 # nothing because of internal deduplication, and it shouldn't
1167 # complain about being asked to export the "htm7" elements even
1168 # though there aren't any in these datasets or in the database.
1169 export.saveDatasets(datasets, elements=["htm7"])
1170 # Save one of the data IDs again; this should be harmless
1171 # because of internal deduplication.
1172 export.saveDataIds([datasets[0].dataId])
1173 # Save some dimension records directly.
1174 export.saveDimensionData("skymap", [skymapRecord])
1175 self.assertTrue(os.path.exists(exportFile))
1176 with safeTestTempDir(TESTDIR) as importDir:
1177 # We always want this to be a local posix butler
1178 Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
1179 # Calling script.butlerImport tests the implementation of the
1180 # butler command line interface "import" subcommand. Functions
1181 # in the script folder are generally considered protected and
1182 # should not be used as public api.
1183 with open(exportFile, "r") as f:
1184 script.butlerImport(
1185 importDir,
1186 export_file=f,
1187 directory=exportDir,
1188 transfer="auto",
1189 skip_dimensions=None,
1190 reuse_ids=False,
1191 )
1192 importButler = Butler(importDir, run="ingest")
1193 for ref in datasets:
1194 with self.subTest(ref=ref):
1195 # Test for existence by passing in the DatasetType and
1196 # data ID separately, to avoid lookup by dataset_id.
1197 self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
1198 self.assertEqual(
1199 list(importButler.registry.queryDimensionRecords("skymap")),
1200 [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
1201 )
1203 def testRemoveRuns(self):
1204 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1205 butler = Butler(self.tmpConfigFile, writeable=True)
1206 # Load registry data with dimensions to hang datasets off of.
1207 registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
1208 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1209 # Add some RUN-type collection.
1210 run1 = "run1"
1211 butler.registry.registerRun(run1)
1212 run2 = "run2"
1213 butler.registry.registerRun(run2)
1214 # put a dataset in each
1215 metric = makeExampleMetrics()
1216 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1217 datasetType = self.addDatasetType(
1218 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1219 )
1220 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1221 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1222 uri1 = butler.getURI(ref1, collections=[run1])
1223 uri2 = butler.getURI(ref2, collections=[run2])
1224 # Remove from both runs with different values for unstore.
1225 butler.removeRuns([run1], unstore=True)
1226 butler.removeRuns([run2], unstore=False)
1227 # Should be nothing in registry for either one, and datastore should
1228 # not think either exists.
1229 with self.assertRaises(MissingCollectionError):
1230 butler.registry.getCollectionType(run1)
1231 with self.assertRaises(MissingCollectionError):
1232 butler.registry.getCollectionType(run2)
1233 self.assertFalse(butler.datastore.exists(ref1))
1234 self.assertFalse(butler.datastore.exists(ref2))
1235 # The ref we unstored should be gone according to the URI, but the
1236 # one we forgot should still be around.
1237 self.assertFalse(uri1.exists())
1238 self.assertTrue(uri2.exists())
1241class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1242 """PosixDatastore specialization of a butler"""
1244 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1245 fullConfigKey = ".datastore.formatters"
1246 validationCanFail = True
1247 datastoreStr = ["/tmp"]
1248 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1249 registryStr = "/gen3.sqlite3"
1251 def testPathConstructor(self):
1252 """Independent test of constructor using PathLike."""
1253 butler = Butler(self.tmpConfigFile, run="ingest")
1254 self.assertIsInstance(butler, Butler)
1256 # And again with a Path object with the butler yaml
1257 path = pathlib.Path(self.tmpConfigFile)
1258 butler = Butler(path, writeable=False)
1259 self.assertIsInstance(butler, Butler)
1261 # And again with a Path object without the butler yaml
1262 # (making sure we skip it if the tmp config doesn't end
1263 # in butler.yaml -- which is the case for a subclass)
1264 if self.tmpConfigFile.endswith("butler.yaml"):
1265 path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
1266 butler = Butler(path, writeable=False)
1267 self.assertIsInstance(butler, Butler)
1269 def testExportTransferCopy(self):
1270 """Test local export using all transfer modes"""
1271 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1272 exportButler = self.runPutGetTest(storageClass, "test_metric")
1273 # Test that the repo actually has at least one dataset.
1274 datasets = list(exportButler.registry.queryDatasets(..., collections=...))
1275 self.assertGreater(len(datasets), 0)
1276 uris = [exportButler.getURI(d) for d in datasets]
1277 datastoreRoot = exportButler.datastore.root
1279 pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]
1281 for path in pathsInStore:
1282 # Assume local file system
1283 self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")
1285 for transfer in ("copy", "link", "symlink", "relsymlink"):
1286 with safeTestTempDir(TESTDIR) as exportDir:
1287 with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
1288 export.saveDatasets(datasets)
1289 for path in pathsInStore:
1290 self.assertTrue(
1291 self.checkFileExists(exportDir, path),
1292 f"Check that mode {transfer} exported files",
1293 )
1295 def testPruneDatasets(self):
1296 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1297 butler = Butler(self.tmpConfigFile, writeable=True)
1298 # Load registry data with dimensions to hang datasets off of.
1299 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1300 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1301 # Add some RUN-type collections.
1302 run1 = "run1"
1303 butler.registry.registerRun(run1)
1304 run2 = "run2"
1305 butler.registry.registerRun(run2)
1306 # put some datasets. ref1 and ref2 have the same data ID, and are in
1307 # different runs. ref3 has a different data ID.
1308 metric = makeExampleMetrics()
1309 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1310 datasetType = self.addDatasetType(
1311 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1312 )
1313 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1314 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1315 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1317 # Simple prune.
1318 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1319 with self.assertRaises(LookupError):
1320 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)
1322 # Put data back.
1323 ref1 = butler.put(metric, ref1.unresolved(), run=run1)
1324 ref2 = butler.put(metric, ref2.unresolved(), run=run2)
1325 ref3 = butler.put(metric, ref3.unresolved(), run=run1)
1327 # Check that in normal mode, deleting the record will lead to
1328 # trash not touching the file.
1329 uri1 = butler.datastore.getURI(ref1)
1330 butler.datastore.bridge.moveToTrash([ref1]) # Update the dataset_location table
1331 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
1332 butler.datastore.trash(ref1)
1333 butler.datastore.emptyTrash()
1334 self.assertTrue(uri1.exists())
1335 uri1.remove() # Clean it up.
1337 # Simulate execution butler setup by deleting the datastore
1338 # record but keeping the file around and trusting.
1339 butler.datastore.trustGetRequest = True
1340 uri2 = butler.datastore.getURI(ref2)
1341 uri3 = butler.datastore.getURI(ref3)
1342 self.assertTrue(uri2.exists())
1343 self.assertTrue(uri3.exists())
1345 # Remove the datastore record.
1346 butler.datastore.bridge.moveToTrash([ref2]) # Update the dataset_location table
1347 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
1348 self.assertTrue(uri2.exists())
1349 butler.datastore.trash([ref2, ref3])
1350 # Immediate removal for ref2 file
1351 self.assertFalse(uri2.exists())
1352 # But ref3 has to wait for the empty.
1353 self.assertTrue(uri3.exists())
1354 butler.datastore.emptyTrash()
1355 self.assertFalse(uri3.exists())
1357 # Clear out the datasets from registry.
1358 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1360 def testPytypeCoercion(self):
1361 """Test python type coercion on Butler.get"""
1363 # Store some data with the normal example storage class.
1364 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1365 datasetTypeName = "test_metric"
1366 butler = self.runPutGetTest(storageClass, datasetTypeName)
1368 dataId = {"instrument": "DummyCamComp", "visit": 423}
1369 metric = butler.get(datasetTypeName, dataId=dataId)
1370 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1372 datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
1373 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1375 # Now need to hack the registry dataset type definition.
1376 # There is no API for this.
1377 manager = butler.registry._managers.datasets
1378 manager._db.update(
1379 manager._static.dataset_type,
1380 {"name": datasetTypeName},
1381 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1382 )
1384 # Force reset of dataset type cache
1385 butler.registry.refresh()
1387 datasetType_new = butler.registry.getDatasetType(datasetTypeName)
1388 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1389 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1391 metric_model = butler.get(datasetTypeName, dataId=dataId)
1392 self.assertNotEqual(type(metric_model), type(metric))
1393 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1395 # Put the model and read it back to show that everything now
1396 # works as normal.
1397 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1398 metric_model_new = butler.get(metric_ref)
1399 self.assertEqual(metric_model_new, metric_model)
1401 # Hack the storage class again to something that will fail on the
1402 # get with no conversion class.
1403 manager._db.update(
1404 manager._static.dataset_type,
1405 {"name": datasetTypeName},
1406 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1407 )
1408 butler.registry.refresh()
1410 with self.assertRaises(ValueError):
1411 butler.get(datasetTypeName, dataId=dataId)
1414class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1415 """InMemoryDatastore specialization of a butler"""
1417 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1418 fullConfigKey = None
1419 useTempRoot = False
1420 validationCanFail = False
1421 datastoreStr = ["datastore='InMemory"]
1422 datastoreName = ["InMemoryDatastore@"]
1423 registryStr = "/gen3.sqlite3"
1425 def testIngest(self):
1426 pass
1429class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1430 """PosixDatastore specialization"""
1432 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
1433 fullConfigKey = ".datastore.datastores.1.formatters"
1434 validationCanFail = True
1435 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
1436 datastoreName = [
1437 "InMemoryDatastore@",
1438 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
1439 "SecondDatastore",
1440 ]
1441 registryStr = "/gen3.sqlite3"
1444class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
1445 """Test that a yaml file in one location can refer to a root in another."""
1447 datastoreStr = ["dir1"]
1448 # Disable the makeRepo test since we are deliberately not using
1449 # butler.yaml as the config name.
1450 fullConfigKey = None
1452 def setUp(self):
1453 self.root = makeTestTempDir(TESTDIR)
1455 # Make a new repository in one place
1456 self.dir1 = os.path.join(self.root, "dir1")
1457 Butler.makeRepo(self.dir1, config=Config(self.configFile))
1459 # Move the yaml file to a different place and add a "root"
1460 self.dir2 = os.path.join(self.root, "dir2")
1461 os.makedirs(self.dir2, exist_ok=True)
1462 configFile1 = os.path.join(self.dir1, "butler.yaml")
1463 config = Config(configFile1)
1464 config["root"] = self.dir1
1465 configFile2 = os.path.join(self.dir2, "butler2.yaml")
1466 config.dumpToUri(configFile2)
1467 os.remove(configFile1)
1468 self.tmpConfigFile = configFile2
1470 def testFileLocations(self):
1471 self.assertNotEqual(self.dir1, self.dir2)
1472 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
1473 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
1474 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
1477class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
1478 """Test that a config file created by makeRepo outside of repo works."""
1480 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1482 def setUp(self):
1483 self.root = makeTestTempDir(TESTDIR)
1484 self.root2 = makeTestTempDir(TESTDIR)
1486 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
1487 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1489 def tearDown(self):
1490 if os.path.exists(self.root2):
1491 shutil.rmtree(self.root2, ignore_errors=True)
1492 super().tearDown()
1494 def testConfigExistence(self):
1495 c = Config(self.tmpConfigFile)
1496 uri_config = ResourcePath(c["root"])
1497 uri_expected = ResourcePath(self.root, forceDirectory=True)
1498 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
1499 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
1501 def testPutGet(self):
1502 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1503 self.runPutGetTest(storageClass, "test_metric")
1506class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
1507 """Test that a config file created by makeRepo outside of repo works."""
1509 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1511 def setUp(self):
1512 self.root = makeTestTempDir(TESTDIR)
1513 self.root2 = makeTestTempDir(TESTDIR)
1515 self.tmpConfigFile = self.root2
1516 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1518 def testConfigExistence(self):
1519 # Append the yaml file else Config constructor does not know the file
1520 # type.
1521 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
1522 super().testConfigExistence()
1525class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
1526 """Test that a config file created by makeRepo outside of repo works."""
1528 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1530 def setUp(self):
1531 self.root = makeTestTempDir(TESTDIR)
1532 self.root2 = makeTestTempDir(TESTDIR)
1534 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
1535 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1538@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
1539@mock_s3
1540class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1541 """S3Datastore specialization of a butler; an S3 storage Datastore +
1542 a local in-memory SqlRegistry.
1543 """
1545 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
1546 fullConfigKey = None
1547 validationCanFail = True
1549 bucketName = "anybucketname"
1550 """Name of the Bucket that will be used in the tests. The name is read from
1551 the config file used with the tests during set-up.
1552 """
1554 root = "butlerRoot/"
1555 """Root repository directory expected to be used in case useTempRoot=False.
1556 Otherwise the root is set to a 20 characters long randomly generated string
1557 during set-up.
1558 """
1560 datastoreStr = [f"datastore={root}"]
1561 """Contains all expected root locations in a format expected to be
1562 returned by Butler stringification.
1563 """
1565 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
1566 """The expected format of the S3 Datastore string."""
1568 registryStr = "/gen3.sqlite3"
1569 """Expected format of the Registry string."""
1571 def genRoot(self):
1572 """Returns a random string of len 20 to serve as a root
1573 name for the temporary bucket repo.
1575 This is equivalent to tempfile.mkdtemp as this is what self.root
1576 becomes when useTempRoot is True.
1577 """
1578 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1579 return rndstr + "/"
1581 def setUp(self):
1582 config = Config(self.configFile)
1583 uri = ResourcePath(config[".datastore.datastore.root"])
1584 self.bucketName = uri.netloc
1586 # set up some fake credentials if they do not exist
1587 self.usingDummyCredentials = setAwsEnvCredentials()
1589 if self.useTempRoot:
1590 self.root = self.genRoot()
1591 rooturi = f"s3://{self.bucketName}/{self.root}"
1592 config.update({"datastore": {"datastore": {"root": rooturi}}})
1594 # need local folder to store registry database
1595 self.reg_dir = makeTestTempDir(TESTDIR)
1596 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1598 # MOTO needs to know that we expect Bucket bucketname to exist
1599 # (this used to be the class attribute bucketName)
1600 s3 = boto3.resource("s3")
1601 s3.create_bucket(Bucket=self.bucketName)
1603 self.datastoreStr = f"datastore={self.root}"
1604 self.datastoreName = [f"FileDatastore@{rooturi}"]
1605 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
1606 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
1608 def tearDown(self):
1609 s3 = boto3.resource("s3")
1610 bucket = s3.Bucket(self.bucketName)
1611 try:
1612 bucket.objects.all().delete()
1613 except botocore.exceptions.ClientError as e:
1614 if e.response["Error"]["Code"] == "404":
1615 # the key was not reachable - pass
1616 pass
1617 else:
1618 raise
1620 bucket = s3.Bucket(self.bucketName)
1621 bucket.delete()
1623 # unset any potentially set dummy credentials
1624 if self.usingDummyCredentials:
1625 unsetAwsEnvCredentials()
1627 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1628 shutil.rmtree(self.reg_dir, ignore_errors=True)
1630 if self.useTempRoot and os.path.exists(self.root):
1631 shutil.rmtree(self.root, ignore_errors=True)
1634@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!")
1635# Mock required environment variables during tests
1636@unittest.mock.patch.dict(
1637 os.environ,
1638 {
1639 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
1640 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
1641 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
1642 },
1643)
1644class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1645 """WebdavDatastore specialization of a butler; a Webdav storage Datastore +
1646 a local in-memory SqlRegistry.
1647 """
1649 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml")
1650 fullConfigKey = None
1651 validationCanFail = True
1653 serverName = "localhost"
1654 """Name of the server that will be used in the tests.
1655 """
1657 portNumber = 8080
1658 """Port on which the webdav server listens. Automatically chosen
1659 at setUpClass via the _getfreeport() method
1660 """
1662 root = "butlerRoot/"
1663 """Root repository directory expected to be used in case useTempRoot=False.
1664 Otherwise the root is set to a 20 characters long randomly generated string
1665 during set-up.
1666 """
1668 datastoreStr = [f"datastore={root}"]
1669 """Contains all expected root locations in a format expected to be
1670 returned by Butler stringification.
1671 """
1673 datastoreName = ["FileDatastore@https://{serverName}/{root}"]
1674 """The expected format of the WebdavDatastore string."""
1676 registryStr = "/gen3.sqlite3"
1677 """Expected format of the Registry string."""
1679 serverThread = None
1680 """Thread in which the local webdav server will run"""
1682 stopWebdavServer = False
1683 """This flag will cause the webdav server to
1684 gracefully shut down when True
1685 """
1687 def genRoot(self):
1688 """Returns a random string of len 20 to serve as a root
1689 name for the temporary bucket repo.
1691 This is equivalent to tempfile.mkdtemp as this is what self.root
1692 becomes when useTempRoot is True.
1693 """
1694 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1695 return rndstr + "/"
1697 @classmethod
1698 def setUpClass(cls):
1699 # Do the same as inherited class
1700 cls.storageClassFactory = StorageClassFactory()
1701 cls.storageClassFactory.addFromConfig(cls.configFile)
1703 cls.portNumber = cls._getfreeport()
1704 # Run a local webdav server on which tests will be run
1705 cls.serverThread = Thread(
1706 target=cls._serveWebdav, args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), daemon=True
1707 )
1708 cls.serverThread.start()
1709 # Wait for it to start
1710 time.sleep(3)
1712 @classmethod
1713 def tearDownClass(cls):
1714 # Ask for graceful shut down of the webdav server
1715 cls.stopWebdavServer = True
1716 # Wait for the thread to exit
1717 cls.serverThread.join()
1719 # Mock required environment variables during tests
1720 @unittest.mock.patch.dict(
1721 os.environ,
1722 {
1723 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
1724 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
1725 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
1726 },
1727 )
1728 def setUp(self):
1729 config = Config(self.configFile)
1731 if self.useTempRoot:
1732 self.root = self.genRoot()
1733 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}"
1734 config.update({"datastore": {"datastore": {"root": self.rooturi}}})
1736 # need local folder to store registry database
1737 self.reg_dir = makeTestTempDir(TESTDIR)
1738 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1740 self.datastoreStr = f"datastore={self.root}"
1741 self.datastoreName = [f"FileDatastore@{self.rooturi}"]
1743 if not isWebdavEndpoint(self.rooturi):
1744 raise OSError("Webdav server not running properly: cannot run tests.")
1746 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False)
1747 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml")
1749 # Mock required environment variables during tests
1750 @unittest.mock.patch.dict(
1751 os.environ,
1752 {
1753 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
1754 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
1755 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
1756 },
1757 )
1758 def tearDown(self):
1759 # Clear temporary directory
1760 ResourcePath(self.rooturi).remove()
1761 ResourcePath(self.rooturi).session.close()
1763 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1764 shutil.rmtree(self.reg_dir, ignore_errors=True)
1766 if self.useTempRoot and os.path.exists(self.root):
1767 shutil.rmtree(self.root, ignore_errors=True)
1769 def _serveWebdav(self, port: int, stopWebdavServer):
1770 """Starts a local webdav-compatible HTTP server,
1771 Listening on http://localhost:port
1772 This server only runs when this test class is instantiated,
1773 and then shuts down. Must be started is a separate thread.
1775 Parameters
1776 ----------
1777 port : `int`
1778 The port number on which the server should listen
1779 """
1780 root_path = gettempdir()
1782 config = {
1783 "host": "0.0.0.0",
1784 "port": port,
1785 "provider_mapping": {"/": root_path},
1786 "http_authenticator": {"domain_controller": None},
1787 "simple_dc": {"user_mapping": {"*": True}},
1788 "verbose": 0,
1789 }
1790 app = WsgiDAVApp(config)
1792 server_args = {
1793 "bind_addr": (config["host"], config["port"]),
1794 "wsgi_app": app,
1795 }
1796 server = wsgi.Server(**server_args)
1797 server.prepare()
1799 try:
1800 # Start the actual server in a separate thread
1801 t = Thread(target=server.serve, daemon=True)
1802 t.start()
1803 # watch stopWebdavServer, and gracefully
1804 # shut down the server when True
1805 while True:
1806 if stopWebdavServer():
1807 break
1808 time.sleep(1)
1809 except KeyboardInterrupt:
1810 print("Caught Ctrl-C, shutting down...")
1811 finally:
1812 server.stop()
1813 t.join()
1815 def _getfreeport():
1816 """
1817 Determines a free port using sockets.
1818 """
1819 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
1820 free_socket.bind(("0.0.0.0", 0))
1821 free_socket.listen()
1822 port = free_socket.getsockname()[1]
1823 free_socket.close()
1824 return port
1827class PosixDatastoreTransfers(unittest.TestCase):
1828 """Test data transfers between butlers.
1830 Test for different managers. UUID to UUID and integer to integer are
1831 tested. UUID to integer is not supported since we do not currently
1832 want to allow that. Integer to UUID is supported with the caveat
1833 that UUID4 will be generated and this will be incorrect for raw
1834 dataset types. The test ignores that.
1835 """
1837 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1839 @classmethod
1840 def setUpClass(cls):
1841 cls.storageClassFactory = StorageClassFactory()
1842 cls.storageClassFactory.addFromConfig(cls.configFile)
1844 def setUp(self):
1845 self.root = makeTestTempDir(TESTDIR)
1846 self.config = Config(self.configFile)
1848 def tearDown(self):
1849 removeTestTempDir(self.root)
1851 def create_butler(self, manager, label):
1852 config = Config(self.configFile)
1853 config["registry", "managers", "datasets"] = manager
1854 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
1856 def create_butlers(self, manager1, manager2):
1857 self.source_butler = self.create_butler(manager1, "1")
1858 self.target_butler = self.create_butler(manager2, "2")
1860 def testTransferUuidToUuid(self):
1861 self.create_butlers(
1862 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1863 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1864 )
1865 # Setting id_gen_map should have no effect here
1866 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1868 def testTransferIntToInt(self):
1869 self.create_butlers(
1870 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1871 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1872 )
1873 # int dataset ID only allows UNIQUE
1874 self.assertButlerTransfers()
1876 def testTransferIntToUuid(self):
1877 self.create_butlers(
1878 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
1879 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1880 )
1881 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1883 def testTransferMissing(self):
1884 """Test transfers where datastore records are missing.
1886 This is how execution butler works.
1887 """
1888 self.create_butlers(
1889 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1890 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1891 )
1893 # Configure the source butler to allow trust.
1894 self.source_butler.datastore.trustGetRequest = True
1896 self.assertButlerTransfers(purge=True)
1898 def testTransferMissingDisassembly(self):
1899 """Test transfers where datastore records are missing.
1901 This is how execution butler works.
1902 """
1903 self.create_butlers(
1904 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1905 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1906 )
1908 # Configure the source butler to allow trust.
1909 self.source_butler.datastore.trustGetRequest = True
1911 # Test disassembly.
1912 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
1914 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"):
1915 """Test that a run can be transferred to another butler."""
1917 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
1918 datasetTypeName = "random_data"
1920 # Test will create 3 collections and we will want to transfer
1921 # two of those three.
1922 runs = ["run1", "run2", "other"]
1924 # Also want to use two different dataset types to ensure that
1925 # grouping works.
1926 datasetTypeNames = ["random_data", "random_data_2"]
1928 # Create the run collections in the source butler.
1929 for run in runs:
1930 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
1932 # Create dimensions in both butlers (transfer will not create them).
1933 n_exposures = 30
1934 for butler in (self.source_butler, self.target_butler):
1935 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1936 butler.registry.insertDimensionData(
1937 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
1938 )
1939 butler.registry.insertDimensionData(
1940 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
1941 )
1943 for i in range(n_exposures):
1944 butler.registry.insertDimensionData(
1945 "exposure",
1946 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
1947 )
1949 # Create dataset types in the source butler.
1950 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
1951 for datasetTypeName in datasetTypeNames:
1952 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
1953 self.source_butler.registry.registerDatasetType(datasetType)
1955 # Write a dataset to an unrelated run -- this will ensure that
1956 # we are rewriting integer dataset ids in the target if necessary.
1957 # Will not be relevant for UUID.
1958 run = "distraction"
1959 butler = Butler(butler=self.source_butler, run=run)
1960 butler.put(
1961 makeExampleMetrics(),
1962 datasetTypeName,
1963 exposure=1,
1964 instrument="DummyCamComp",
1965 physical_filter="d-r",
1966 )
1968 # Write some example metrics to the source
1969 butler = Butler(butler=self.source_butler)
1971 # Set of DatasetRefs that should be in the list of refs to transfer
1972 # but which will not be transferred.
1973 deleted = set()
1975 n_expected = 20 # Number of datasets expected to be transferred
1976 source_refs = []
1977 for i in range(n_exposures):
1978 # Put a third of datasets into each collection, only retain
1979 # two thirds.
1980 index = i % 3
1981 run = runs[index]
1982 datasetTypeName = datasetTypeNames[i % 2]
1984 metric_data = {
1985 "summary": {"counter": i},
1986 "output": {"text": "metric"},
1987 "data": [2 * x for x in range(i)],
1988 }
1989 metric = MetricsExample(**metric_data)
1990 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
1991 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
1993 # Remove the datastore record using low-level API
1994 if purge:
1995 # Remove records for a fraction.
1996 if index == 1:
1998 # For one of these delete the file as well.
1999 # This allows the "missing" code to filter the
2000 # file out.
2001 if not deleted:
2002 primary, uris = butler.datastore.getURIs(ref)
2003 if primary:
2004 primary.remove()
2005 for uri in uris.values():
2006 uri.remove()
2007 n_expected -= 1
2008 deleted.add(ref)
2010 # Remove the datastore record.
2011 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id})
2013 if index < 2:
2014 source_refs.append(ref)
2015 if ref not in deleted:
2016 new_metric = butler.get(ref.unresolved(), collections=run)
2017 self.assertEqual(new_metric, metric)
2019 # Create some bad dataset types to ensure we check for inconsistent
2020 # definitions.
2021 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
2022 for datasetTypeName in datasetTypeNames:
2023 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
2024 self.target_butler.registry.registerDatasetType(datasetType)
2025 with self.assertRaises(ConflictingDefinitionError):
2026 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2027 # And remove the bad definitions.
2028 for datasetTypeName in datasetTypeNames:
2029 self.target_butler.registry.removeDatasetType(datasetTypeName)
2031 # Transfer without creating dataset types should fail.
2032 with self.assertRaises(KeyError):
2033 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2035 # Now transfer them to the second butler
2036 with self.assertLogs(level=logging.DEBUG) as cm:
2037 transferred = self.target_butler.transfer_from(
2038 self.source_butler, source_refs, id_gen_map=id_gen_map, register_dataset_types=True
2039 )
2040 self.assertEqual(len(transferred), n_expected)
2041 log_output = ";".join(cm.output)
2042 self.assertIn("found in datastore for chunk", log_output)
2043 self.assertIn("Creating output run", log_output)
2045 # Do the transfer twice to ensure that it will do nothing extra.
2046 # Only do this if purge=True because it does not work for int
2047 # dataset_id.
2048 if purge:
2049 # This should not need to register dataset types.
2050 transferred = self.target_butler.transfer_from(
2051 self.source_butler, source_refs, id_gen_map=id_gen_map
2052 )
2053 self.assertEqual(len(transferred), n_expected)
2055 # Also do an explicit low-level transfer to trigger some
2056 # edge cases.
2057 with self.assertLogs(level=logging.DEBUG) as cm:
2058 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
2059 log_output = ";".join(cm.output)
2060 self.assertIn("no file artifacts exist", log_output)
2062 with self.assertRaises(TypeError):
2063 self.target_butler.datastore.transfer_from(self.source_butler, source_refs)
2065 with self.assertRaises(ValueError):
2066 self.target_butler.datastore.transfer_from(
2067 self.source_butler.datastore, source_refs, transfer="split"
2068 )
2070 # Now try to get the same refs from the new butler.
2071 for ref in source_refs:
2072 if ref not in deleted:
2073 unresolved_ref = ref.unresolved()
2074 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run)
2075 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run)
2076 self.assertEqual(new_metric, old_metric)
2078 # Now prune run2 collection and create instead a CHAINED collection.
2079 # This should block the transfer.
2080 self.target_butler.pruneCollection("run2", purge=True, unstore=True)
2081 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2082 with self.assertRaises(TypeError):
2083 # Re-importing the run1 datasets can be problematic if they
2084 # use integer IDs so filter those out.
2085 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2086 self.target_butler.transfer_from(self.source_butler, to_transfer, id_gen_map=id_gen_map)
2089if __name__ == "__main__": 2089 ↛ 2090line 2089 didn't jump to line 2090, because the condition on line 2089 was never true
2090 unittest.main()