Coverage for tests/test_butler.py: 16%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Tests for Butler.
23"""
25import logging
26import os
27import pathlib
28import pickle
29import posixpath
30import random
31import shutil
32import socket
33import string
34import tempfile
35import time
36import unittest

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto's mock_s3 cannot be imported."""
        return cls

try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None
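    # Left as None so later code can detect that the optional WebDAV
    # test dependency is unavailable.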

from tempfile import gettempdir
from threading import Thread

import astropy.time
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    ButlerURI,
    CollectionSearch,
    CollectionType,
    Config,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError, MissingCollectionError
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources.http import isWebdavEndpoint
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImport
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
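    """Return a simple MetricsExample holding example summary, output,
    and data values."""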
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent the
    misdiagnosis that might otherwise occur when a standard exception is
    used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not covered by any other
    test case."""

    def testSearchPath(self):
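        """Verify that an extra search path can override values from the
        default configuration."""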
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests:
    """Helper class for running a suite of put/get tests against
    different butler configurations."""

    root = None

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
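        """Check that each named component can be read both directly and
        through a deferred handle, and that it matches the reference
        value."""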
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
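        """Remove the per-test temporary root, if one was created."""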
        removeTestTempDir(self.root)

    def runPutGetTest(self, storageClass, datasetTypeName):
        # New datasets will be added to the "ingest" RUN collection, which
        # is also the collection searched when looking up datasets.
        run = "ingest"
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add a second visit for some later tests
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 424,
                "name": "fourtwentyfour",
                "physical_filter": "d-r",
                "visit_system": 1,
            },
        )

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType.

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

                # Can the artifacts themselves be retrieved?
                if not butler.datastore.isEphemeral:
                    root_uri = ButlerURI(self.root)

                    for preserve_path in (True, False):
                        destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                        # Use copy so that we can test that overwrite
                        # protection works (using "auto" for File URIs would
                        # use hard links and subsequent transfer would work
                        # because it knows they are the same file).
                        transferred = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, transfer="copy"
                        )
                        self.assertGreater(len(transferred), 0)
                        artifacts = list(ButlerURI.findFileResources([destination]))
                        self.assertEqual(set(transferred), set(artifacts))

                        for artifact in transferred:
                            path_in_destination = artifact.relative_to(destination)
                            self.assertIsNotNone(path_in_destination)

                            # When path is not preserved there should not be
                            # any path separators.
                            num_seps = path_in_destination.count("/")
                            if preserve_path:
                                self.assertGreater(num_seps, 0)
                            else:
                                self.assertEqual(num_seps, 0)

                        primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                        n_uris = len(secondary_uris)
                        if primary_uri:
                            n_uris += 1
                        self.assertEqual(
                            len(artifacts),
                            n_uris,
                            "Comparing expected artifacts vs actual:"
                            f" {artifacts} vs {primary_uri} and {secondary_uris}",
                        )

                        if preserve_path:
                            # No need to run these twice
                            with self.assertRaises(ValueError):
                                butler.retrieveArtifacts([ref], destination, transfer="move")

                            with self.assertRaises(FileExistsError):
                                butler.retrieveArtifacts([ref], destination)

                            transferred_again = butler.retrieveArtifacts(
                                [ref], destination, preserve_path=preserve_path, overwrite=True
                            )
                            self.assertEqual(set(transferred_again), set(transferred))

                # Now remove the dataset completely.
                butler.pruneDatasets([ref], purge=True, unstore=True, run=this_run)
                # Lookup with original args should still fail.
                with self.assertRaises(LookupError):
                    butler.datasetExists(*args, collections=this_run)
                # getDirect() should still fail.
                with self.assertRaises(FileNotFoundError):
                    butler.getDirect(ref)
                # Registry shouldn't be able to find it by dataset_id anymore.
                self.assertIsNone(butler.registry.getDataset(ref.id))

                # Do explicit registry removal since we know they are
                # empty.
                butler.registry.removeCollection(this_run)
                expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed.
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present.

        return butler

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # A second registration is allowed but indicates a no-op.
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with TypeError.
        with self.assertRaises(TypeError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is a TypeError.
        with self.assertRaises(TypeError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(TypeError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Deleting the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {"ingest"})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, CollectionSearch.fromExpression(["other"]))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
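        # The index file is a simple mapping from repository label to
        # butler config URI; both YAML and JSON forms are exercised below.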
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ButlerURI.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ButlerURI(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ButlerURI)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ButlerURI)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ButlerURI)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ButlerURI)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ButlerURI)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ButlerURI)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testIngest(self):
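        """Test ingesting external files, both one dataset per file and
        multiple datasets sharing a single file."""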
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile, refs=refs, formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory cannot ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

    def testPruneCollections(self):
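        """Exercise pruneCollection for RUN, TAGGED, and CHAINED
        collections, checking registry and datastore state at each
        step."""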
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge and not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [])

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)

    def testPickle(self):
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
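        """Check dataset type registration, component queries, and
        configuration validation."""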
        butler = Butler(self.tmpConfigFile, run="ingest")
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            (
                "instrument",
                {"instrument": "DummyCam"},
                {"instrument": "DummyHSC"},
                {"instrument": "DummyCamComp"},
            ),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are not
        # created for its components, but querying can still return them.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self):
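        """Check that a failed transaction rolls back both registry and
        datastore changes."""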
        butler = Butler(self.tmpConfigFile, run="ingest")
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(LookupError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository
        via the Butler.makeRepo interface and then instantiate a butler
        from the repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root.
        if self.fullConfigKey is None:
            return

        # Create two separate directories.
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them.
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with a relocatable Butler repo.
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""

        butler = Butler(self.tmpConfigFile, run="ingest")

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId.
            self.assertEqual(ref.dataId["exposure"], i)

            # And check that we can get the dataset back with the same dataId.
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check whether a file exists at a given path (relative to root).

        The testPutTemplates test verifies the actual physical existence
        of the files in the requested location.
        """
        uri = ButlerURI(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
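        """Check that file templates place datasets at the expected paths
        in the datastore."""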
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric1/??#?/d-r/DummyCamComp_423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric2/d-r/DummyCamComp_v423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Now use a file template that will not result in unique filenames.
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """Export to a temporary directory and import back into a new
        temporary-directory repo. This does not assume a posix datastore.
        """
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        print("Root:", exportButler.datastore.root)
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements
                # even though there aren't any in these datasets or in the
                # database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler.
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public api.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                        reuse_ids=False,
                    )
                importButler = Butler(importDir, run="ingest")
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )

    def testRemoveRuns(self):
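        """Check that removeRuns deletes the collections and, depending
        on the unstore flag, the underlying artifacts."""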
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put a dataset in each.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])
        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self):
        """Independent test of constructor using PathLike."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml.
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass).
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self):
        """Test local export using a variety of transfer modes."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        datastoreRoot = exportButler.datastore.root

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume local file system.
            self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
                    export.saveDatasets(datasets)
                for path in pathsInStore:
                    self.assertTrue(
                        self.checkFileExists(exportDir, path),
                        f"Check that mode {transfer} exported files",
                    )

    def testPruneDatasets(self):
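        """Check pruneDatasets and low-level datastore trash handling,
        including trust mode, where files can outlive their records."""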
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Simple prune.
        butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
        with self.assertRaises(LookupError):
            butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)

        # Put data back.
        ref1 = butler.put(metric, ref1.unresolved(), run=run1)
        ref2 = butler.put(metric, ref2.unresolved(), run=run2)
        ref3 = butler.put(metric, ref3.unresolved(), run=run1)

        # Check that in normal mode, deleting the record will lead to
        # trash not touching the file.
        uri1 = butler.datastore.getURI(ref1)
        butler.datastore.bridge.moveToTrash([ref1])  # Update the dataset_location table
        butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
        butler.datastore.trash(ref1)
        butler.datastore.emptyTrash()
        self.assertTrue(uri1.exists())
        uri1.remove()  # Clean it up.

        # Simulate execution butler setup by deleting the datastore
        # record but keeping the file around and trusting.
        butler.datastore.trustGetRequest = True
        uri2 = butler.datastore.getURI(ref2)
        uri3 = butler.datastore.getURI(ref3)
        self.assertTrue(uri2.exists())
        self.assertTrue(uri3.exists())

        # Remove the datastore record.
        butler.datastore.bridge.moveToTrash([ref2])  # Update the dataset_location table
        butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
        self.assertTrue(uri2.exists())
        butler.datastore.trash([ref2, ref3])
        # Immediate removal for the ref2 file.
        self.assertFalse(uri2.exists())
        # But ref3 has to wait for the empty.
        self.assertTrue(uri3.exists())
        butler.datastore.emptyTrash()
        self.assertFalse(uri3.exists())

        # Clear out the datasets from registry.
        butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)

    def testPytypeCoercion(self):
        """Test python type coercion on Butler.get."""

        # Store some data with the normal example storage class.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        datasetTypeName = "test_metric"
        butler = self.runPutGetTest(storageClass, datasetTypeName)

        dataId = {"instrument": "DummyCamComp", "visit": 423}
        metric = butler.get(datasetTypeName, dataId=dataId)
        self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")

        datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
        self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")

        # Now hack the registry dataset type definition directly;
        # there is no public API for this.
        manager = butler.registry._managers.datasets
        manager._db.update(
            manager._static.dataset_type,
            {"name": datasetTypeName},
            {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
        )

        # Force a reset of the dataset type cache.
        butler.registry.refresh()

        datasetType_new = butler.registry.getDatasetType(datasetTypeName)
        self.assertEqual(datasetType_new.name, datasetType_ori.name)
        self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")

        metric_model = butler.get(datasetTypeName, dataId=dataId)
        self.assertNotEqual(type(metric_model), type(metric))
        self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
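
        # The coercion above relies on the storage classes declaring how to
        # convert between the stored python type (MetricsExample) and the
        # type now recorded in the registry; get() is expected to return an
        # instance of the newly declared type.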

        # Put the model and read it back to show that everything now
        # works as normal.
        metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
        metric_model_new = butler.get(metric_ref)
        self.assertEqual(metric_model_new, metric_model)

        # Hack the storage class again, this time to something that will
        # fail on get() because no conversion class exists.
        manager._db.update(
            manager._static.dataset_type,
            {"name": datasetTypeName},
            {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
        )
        butler.registry.refresh()

        with self.assertRaises(ValueError):
            butler.get(datasetTypeName, dataId=dataId)


class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
    """InMemoryDatastore specialization of a butler."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
    fullConfigKey = None
    useTempRoot = False
    validationCanFail = False
    datastoreStr = ["datastore='InMemory"]
    datastoreName = ["InMemoryDatastore@"]
    registryStr = "/gen3.sqlite3"

    def testIngest(self):
        # File ingest does not apply to an in-memory datastore, so the
        # inherited test is skipped.
        pass


class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
    """ChainedDatastore specialization of a butler."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
    fullConfigKey = ".datastore.datastores.1.formatters"
    validationCanFail = True
    datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
    datastoreName = [
        "InMemoryDatastore@",
        f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
        "SecondDatastore",
    ]
    registryStr = "/gen3.sqlite3"
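
    # Note: fullConfigKey above indexes into the chain, so
    # ".datastore.datastores.1" is expected to address the second child
    # datastore (the first FileDatastore) of the chained configuration.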


class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
    """Test that a yaml file in one location can refer to a root in another."""

    datastoreStr = ["dir1"]
    # Disable the makeRepo test since we are deliberately not using
    # butler.yaml as the config name.
    fullConfigKey = None

    def setUp(self):
        self.root = makeTestTempDir(TESTDIR)

        # Make a new repository in one place
        self.dir1 = os.path.join(self.root, "dir1")
        Butler.makeRepo(self.dir1, config=Config(self.configFile))

        # Move the yaml file to a different place and add a "root"
        self.dir2 = os.path.join(self.root, "dir2")
        os.makedirs(self.dir2, exist_ok=True)
        configFile1 = os.path.join(self.dir1, "butler.yaml")
        config = Config(configFile1)
        config["root"] = self.dir1
        configFile2 = os.path.join(self.dir2, "butler2.yaml")
        config.dumpToUri(configFile2)
        os.remove(configFile1)
        self.tmpConfigFile = configFile2

    def testFileLocations(self):
        self.assertNotEqual(self.dir1, self.dir2)
        self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
        self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
        self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))


class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
    """Test that a config file created by makeRepo outside of the repo works."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        self.root = makeTestTempDir(TESTDIR)
        self.root2 = makeTestTempDir(TESTDIR)

        self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
        Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)

    def tearDown(self):
        if os.path.exists(self.root2):
            shutil.rmtree(self.root2, ignore_errors=True)
        super().tearDown()

    def testConfigExistence(self):
        c = Config(self.tmpConfigFile)
        uri_config = ButlerURI(c["root"])
        uri_expected = ButlerURI(self.root, forceDirectory=True)
        self.assertEqual(uri_config.geturl(), uri_expected.geturl())
        self.assertNotIn(":", uri_config.path, "Check for a URI concatenated with a normal path")

    def testPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")


class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
    """Test that makeRepo can write its config file to a directory outside
    the repo."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        self.root = makeTestTempDir(TESTDIR)
        self.root2 = makeTestTempDir(TESTDIR)

        self.tmpConfigFile = self.root2
        Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)

    def testConfigExistence(self):
        # Append the yaml file name, else the Config constructor does not
        # know the file type.
        self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
        super().testConfigExistence()


class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
    """Test that makeRepo can write its config file to a URI outside the
    repo."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        self.root = makeTestTempDir(TESTDIR)
        self.root2 = makeTestTempDir(TESTDIR)

        self.tmpConfigFile = ButlerURI(os.path.join(self.root2, "something.yaml")).geturl()
        Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)


@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
@mock_s3
class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """S3Datastore specialization of a butler: an S3 datastore plus a local
    SQLite registry.
    """

    configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
    fullConfigKey = None
    validationCanFail = True

    bucketName = "anybucketname"
    """Name of the bucket that will be used in the tests. The name is read
    from the config file used with the tests during set-up.
    """

    root = "butlerRoot/"
    """Root repository directory expected to be used when useTempRoot=False.
    Otherwise the root is set to a randomly generated 20-character string
    during set-up.
    """

    datastoreStr = [f"datastore={root}"]
    """Contains all expected root locations in a format expected to be
    returned by Butler stringification.
    """

    datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
    """The expected format of the S3 datastore string."""

    registryStr = "/gen3.sqlite3"
    """Expected format of the registry string."""

    def genRoot(self):
        """Return a random 20-character string to serve as a root
        name for the temporary bucket repo.

        This is the S3 analogue of tempfile.mkdtemp: it is what self.root
        becomes when useTempRoot is True.
        """
        rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
        return rndstr + "/"

    def setUp(self):
        config = Config(self.configFile)
        uri = ButlerURI(config[".datastore.datastore.root"])
        self.bucketName = uri.netloc

        # Set up some fake credentials if they do not exist.
        self.usingDummyCredentials = setAwsEnvCredentials()

        if self.useTempRoot:
            self.root = self.genRoot()
        rooturi = f"s3://{self.bucketName}/{self.root}"
        config.update({"datastore": {"datastore": {"root": rooturi}}})

        # Need a local folder to store the registry database.
        self.reg_dir = makeTestTempDir(TESTDIR)
        config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"

        # Moto needs to know that we expect the bucket to exist.
        # (This used to be the class attribute bucketName.)
        s3 = boto3.resource("s3")
        s3.create_bucket(Bucket=self.bucketName)

        self.datastoreStr = f"datastore={self.root}"
        self.datastoreName = [f"FileDatastore@{rooturi}"]
        Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
        self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")

    def tearDown(self):
        s3 = boto3.resource("s3")
        bucket = s3.Bucket(self.bucketName)
        try:
            bucket.objects.all().delete()
        except botocore.exceptions.ClientError as e:
            if e.response["Error"]["Code"] == "404":
                # The key was not reachable; nothing to delete.
                pass
            else:
                raise

        bucket = s3.Bucket(self.bucketName)
        bucket.delete()

        # Unset any potentially set dummy credentials.
        if self.usingDummyCredentials:
            unsetAwsEnvCredentials()

        if self.reg_dir is not None and os.path.exists(self.reg_dir):
            shutil.rmtree(self.reg_dir, ignore_errors=True)

        if self.useTempRoot and os.path.exists(self.root):
            shutil.rmtree(self.root, ignore_errors=True)


@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!")
# Mock required environment variables during tests.
@unittest.mock.patch.dict(
    os.environ,
    {
        "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
        "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
        "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
    },
)
class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """WebdavDatastore specialization of a butler: a WebDAV datastore plus
    a local SQLite registry.
    """

    configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml")
    fullConfigKey = None
    validationCanFail = True

    serverName = "localhost"
    """Name of the server that will be used in the tests."""

    portNumber = 8080
    """Port on which the webdav server listens. Automatically chosen
    in setUpClass via the _getfreeport() method.
    """

    root = "butlerRoot/"
    """Root repository directory expected to be used when useTempRoot=False.
    Otherwise the root is set to a randomly generated 20-character string
    during set-up.
    """

    datastoreStr = [f"datastore={root}"]
    """Contains all expected root locations in a format expected to be
    returned by Butler stringification.
    """

    datastoreName = ["FileDatastore@https://{serverName}/{root}"]
    """The expected format of the WebdavDatastore string."""

    registryStr = "/gen3.sqlite3"
    """Expected format of the registry string."""

    serverThread = None
    """Thread in which the local webdav server will run."""

    stopWebdavServer = False
    """Flag that, when set to True, causes the webdav server to shut down
    gracefully.
    """

    def genRoot(self):
        """Return a random 20-character string to serve as a root
        name for the temporary repo.

        This is the WebDAV analogue of tempfile.mkdtemp: it is what
        self.root becomes when useTempRoot is True.
        """
        rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
        return rndstr + "/"

    @classmethod
    def setUpClass(cls):
        # Do the same as the inherited class.
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

        cls.portNumber = cls._getfreeport()
        # Run a local webdav server against which the tests will be run;
        # cls is passed explicitly because _serveWebdav is invoked here as
        # a plain function.
        cls.serverThread = Thread(
            target=cls._serveWebdav, args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), daemon=True
        )
        cls.serverThread.start()
        # Wait for it to start.
        time.sleep(3)
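
        # Note: the fixed sleep is only a crude readiness wait; the
        # isWebdavEndpoint() check in setUp() is what actually confirms
        # that the server is answering before any test runs.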

    @classmethod
    def tearDownClass(cls):
        # Ask for a graceful shutdown of the webdav server.
        cls.stopWebdavServer = True
        # Wait for the thread to exit.
        cls.serverThread.join()

    # Mock required environment variables during tests.
    @unittest.mock.patch.dict(
        os.environ,
        {
            "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
            "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
            "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
        },
    )
    def setUp(self):
        config = Config(self.configFile)

        if self.useTempRoot:
            self.root = self.genRoot()
        self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}"
        config.update({"datastore": {"datastore": {"root": self.rooturi}}})

        # Need a local folder to store the registry database.
        self.reg_dir = makeTestTempDir(TESTDIR)
        config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"

        self.datastoreStr = f"datastore={self.root}"
        self.datastoreName = [f"FileDatastore@{self.rooturi}"]

        if not isWebdavEndpoint(self.rooturi):
            raise OSError("Webdav server not running properly: cannot run tests.")

        Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False)
        self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml")

    # Mock required environment variables during tests.
    @unittest.mock.patch.dict(
        os.environ,
        {
            "LSST_BUTLER_WEBDAV_AUTH": "TOKEN",
            "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"),
            "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs",
        },
    )
    def tearDown(self):
        # Clear the temporary directory.
        ButlerURI(self.rooturi).remove()
        ButlerURI(self.rooturi).session.close()

        if self.reg_dir is not None and os.path.exists(self.reg_dir):
            shutil.rmtree(self.reg_dir, ignore_errors=True)

        if self.useTempRoot and os.path.exists(self.root):
            shutil.rmtree(self.root, ignore_errors=True)

    def _serveWebdav(self, port: int, stopWebdavServer):
        """Start a local webdav-compatible HTTP server listening on
        http://localhost:port.

        The server runs only while this test class is being exercised and
        then shuts down. It must be started in a separate thread.

        Parameters
        ----------
        port : `int`
            The port number on which the server should listen.
        stopWebdavServer : callable
            Callable that returns `True` when the server should shut down.
        """
        root_path = gettempdir()

        config = {
            "host": "0.0.0.0",
            "port": port,
            "provider_mapping": {"/": root_path},
            "http_authenticator": {"domain_controller": None},
            "simple_dc": {"user_mapping": {"*": True}},
            "verbose": 0,
        }
        app = WsgiDAVApp(config)

        server_args = {
            "bind_addr": (config["host"], config["port"]),
            "wsgi_app": app,
        }
        server = wsgi.Server(**server_args)
        server.prepare()

        try:
            # Start the actual server in a separate thread.
            t = Thread(target=server.serve, daemon=True)
            t.start()
            # Watch stopWebdavServer and gracefully shut down the server
            # when it returns True.
            while True:
                if stopWebdavServer():
                    break
                time.sleep(1)
        except KeyboardInterrupt:
            print("Caught Ctrl-C, shutting down...")
        finally:
            server.stop()
            t.join()

    @staticmethod
    def _getfreeport():
        """Determines a free port using sockets."""
        free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        free_socket.bind(("0.0.0.0", 0))
        free_socket.listen()
        port = free_socket.getsockname()[1]
        free_socket.close()
        return port
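
    # Note: binding to port 0 lets the OS pick an ephemeral free port.
    # There is an unavoidable race between closing this probe socket and
    # the webdav server binding the same port, but for a single-process
    # test setup the window is small enough in practice.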


class PosixDatastoreTransfers(unittest.TestCase):
    """Test data transfers between butlers.

    Tests run for different dataset ID managers: UUID to UUID and integer
    to integer are both exercised. UUID to integer is not supported since
    we do not currently want to allow that. Integer to UUID is supported,
    with the caveat that UUID4 values will be generated and these will be
    incorrect for raw dataset types; the test ignores that.
    """

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def setUp(self):
        self.root = makeTestTempDir(TESTDIR)
        self.config = Config(self.configFile)

    def tearDown(self):
        removeTestTempDir(self.root)
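
    # The "registry.managers.datasets" configuration entry selects the
    # dataset record manager and therefore the dataset ID scheme (integer
    # or UUID); overriding it per repository is what lets these tests mix
    # ID schemes between the source and target butlers.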

    def create_butler(self, manager, label):
        config = Config(self.configFile)
        config["registry", "managers", "datasets"] = manager
        return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)

    def create_butlers(self, manager1, manager2):
        self.source_butler = self.create_butler(manager1, "1")
        self.target_butler = self.create_butler(manager2, "2")

    def testTransferUuidToUuid(self):
        self.create_butlers(
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
        )
        # Setting id_gen_map should have no effect here.
        self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})

    def testTransferIntToInt(self):
        self.create_butlers(
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
        )
        # An integer dataset ID only allows UNIQUE.
        self.assertButlerTransfers()

    def testTransferIntToUuid(self):
        self.create_butlers(
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
        )
        self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})

    def testTransferMissing(self):
        """Test transfers where datastore records are missing.

        This is how execution butler works.
        """
        self.create_butlers(
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
        )

        # Configure the source butler to allow trust.
        self.source_butler.datastore.trustGetRequest = True

        self.assertButlerTransfers(purge=True)

    def testTransferMissingDisassembly(self):
        """Test transfers where datastore records are missing and the
        composite is disassembled.

        This is how execution butler works.
        """
        self.create_butlers(
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
            "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
        )

        # Configure the source butler to allow trust.
        self.source_butler.datastore.trustGetRequest = True

        # Test disassembly.
        self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")

    def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"):
        """Test that a run can be transferred to another butler."""

        storageClass = self.storageClassFactory.getStorageClass(storageClassName)
        datasetTypeName = "random_data"

        # The test will create three collections and we will want to
        # transfer two of those three.
        runs = ["run1", "run2", "other"]

        # Also use two different dataset types to ensure that grouping
        # works.
        datasetTypeNames = ["random_data", "random_data_2"]

        # Create the run collections in the source butler.
        for run in runs:
            self.source_butler.registry.registerCollection(run, CollectionType.RUN)

        # Create dimensions in both butlers (the transfer will not create
        # them).
        n_exposures = 30
        for butler in (self.source_butler, self.target_butler):
            butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
            butler.registry.insertDimensionData(
                "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
            )
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
            )

            for i in range(n_exposures):
                butler.registry.insertDimensionData(
                    "exposure",
                    {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
                )

        # Create dataset types in the source butler.
        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        for datasetTypeName in datasetTypeNames:
            datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
            self.source_butler.registry.registerDatasetType(datasetType)

        # Write a dataset to an unrelated run -- this will ensure that
        # we are rewriting integer dataset IDs in the target if necessary.
        # It is not relevant for UUIDs.
        run = "distraction"
        butler = Butler(butler=self.source_butler, run=run)
        butler.put(
            makeExampleMetrics(),
            datasetTypeName,
            exposure=1,
            instrument="DummyCamComp",
            physical_filter="d-r",
        )

        # Write some example metrics to the source butler.
        butler = Butler(butler=self.source_butler)

        # Set of DatasetRefs that should be in the list of refs to transfer
        # but which will not be transferred.
        deleted = set()

        n_expected = 20  # Number of datasets expected to be transferred.
        source_refs = []
        for i in range(n_exposures):
            # Put a third of the datasets into each collection; only two
            # thirds of them are retained for transfer.
            index = i % 3
            run = runs[index]
            datasetTypeName = datasetTypeNames[i % 2]

            metric_data = {
                "summary": {"counter": i},
                "output": {"text": "metric"},
                "data": [2 * x for x in range(i)],
            }
            metric = MetricsExample(**metric_data)
            dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)

            # Remove the datastore record using the low-level API.
            if purge:
                # Remove records for a fraction of the datasets.
                if index == 1:

                    # For one of these, delete the file as well. This
                    # allows the "missing" code to filter the file out.
                    if not deleted:
                        primary, uris = butler.datastore.getURIs(ref)
                        if primary:
                            primary.remove()
                        for uri in uris.values():
                            uri.remove()
                        n_expected -= 1
                        deleted.add(ref)

                    # Remove the datastore record.
                    butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id})

            if index < 2:
                source_refs.append(ref)
            if ref not in deleted:
                new_metric = butler.get(ref.unresolved(), collections=run)
                self.assertEqual(new_metric, metric)

        # Create some bad dataset types to ensure we check for inconsistent
        # definitions.
        badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
        for datasetTypeName in datasetTypeNames:
            datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
            self.target_butler.registry.registerDatasetType(datasetType)
        with self.assertRaises(ConflictingDefinitionError):
            self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
        # And remove the bad definitions.
        for datasetTypeName in datasetTypeNames:
            self.target_butler.registry.removeDatasetType(datasetTypeName)

        # Transferring without creating the dataset types should fail.
        with self.assertRaises(KeyError):
            self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)

        # Now transfer the datasets to the second butler.
        with self.assertLogs(level=logging.DEBUG) as cm:
            transferred = self.target_butler.transfer_from(
                self.source_butler, source_refs, id_gen_map=id_gen_map, register_dataset_types=True
            )
        self.assertEqual(len(transferred), n_expected)
        log_output = ";".join(cm.output)
        self.assertIn("found in datastore for chunk", log_output)
        self.assertIn("Creating output run", log_output)

        # Do the transfer twice to ensure that it will do nothing extra.
        # Only do this when purge=True because it does not work for integer
        # dataset_ids.
        if purge:
            # This should not need to register dataset types.
            transferred = self.target_butler.transfer_from(
                self.source_butler, source_refs, id_gen_map=id_gen_map
            )
            self.assertEqual(len(transferred), n_expected)
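
            # The repeated transfer is expected to be idempotent: datasets
            # already present in the target are recognized by their UUIDs
            # and skipped rather than duplicated.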

            # Also do an explicit low-level transfer to trigger some
            # edge cases.
            with self.assertLogs(level=logging.DEBUG) as cm:
                self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
            log_output = ";".join(cm.output)
            self.assertIn("no file artifacts exist", log_output)

            with self.assertRaises(TypeError):
                self.target_butler.datastore.transfer_from(self.source_butler, source_refs)

            with self.assertRaises(ValueError):
                self.target_butler.datastore.transfer_from(
                    self.source_butler.datastore, source_refs, transfer="split"
                )

        # Now try to get the same refs from the new butler.
        for ref in source_refs:
            if ref not in deleted:
                unresolved_ref = ref.unresolved()
                new_metric = self.target_butler.get(unresolved_ref, collections=ref.run)
                old_metric = self.source_butler.get(unresolved_ref, collections=ref.run)
                self.assertEqual(new_metric, old_metric)

        # Now prune the run2 collection and create a CHAINED collection in
        # its place. This should block the transfer.
        self.target_butler.pruneCollection("run2", purge=True, unstore=True)
        self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
        with self.assertRaises(TypeError):
            # Re-importing the run1 datasets can be problematic if they
            # use integer IDs, so filter those out.
            to_transfer = [ref for ref in source_refs if ref.run == "run2"]
            self.target_butler.transfer_from(self.source_butler, to_transfer, id_gen_map=id_gen_map)


if __name__ == "__main__":
    unittest.main()