Coverage for tests/test_butler.py: 17%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler.
"""

import gc
import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import socket
import string
import tempfile
import time
import unittest
import unittest.mock
from tempfile import gettempdir
from threading import Thread

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto's mock_s3 cannot be imported."""
        return cls
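
# For reference: moto's mock_s3 is applied as a decorator, so the no-op
# fallback above must accept and return the decorated class unchanged.
# A minimal sketch of the intended usage (the test class name here is
# hypothetical):
#
#     @mock_s3
#     class S3DatastoreButlerTestCase(unittest.TestCase):
#         # boto3 S3 calls made in these tests hit moto's in-memory
#         # backend rather than real AWS.
#         ...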

try:
    # It's possible but silly to have testing.postgresql installed without
    # having the postgresql server installed (because then nothing in
    # testing.postgresql would work), so we use the presence of that module
    # to test whether we can expect the server to be available.
    import testing.postgresql
except ImportError:
    testing = None

try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None

import astropy.time
import sqlalchemy
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    CollectionSearch,
    CollectionType,
    Config,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
)
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.http import _is_webdav_endpoint
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImport
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )
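
# Note: the three positional arguments above become, in order, the "summary",
# "output", and "data" attributes of MetricsExample, which the put/get tests
# below read back (e.g. metric.summary, metric.data[:stop]).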


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent the misdiagnosis
    that might otherwise occur when a standard exception is used.
    """

    pass
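
# TransactionTestError is raised inside "with butler.transaction():" in
# testTransaction below; a dedicated exception type ensures the test catches
# its own deliberate rollback trigger rather than an unrelated failure.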


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not covered by any other test
    cases."""

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests:
    """Helper class for running a suite of put/get tests from different
    butler configurations."""

    root = None
    default_run = "ingésτ😺"

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
        removeTestTempDir(self.root)

    def create_butler(self, run, storageClass, datasetTypeName):
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                },
            )
        return butler, datasetType

    def runPutGetTest(self, storageClass, datasetTypeName):
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = self.default_run
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

            # Can the artifacts themselves be retrieved?
            if not butler.datastore.isEphemeral:
                root_uri = ResourcePath(self.root)

                for preserve_path in (True, False):
                    destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                    # Use copy so that we can test that overwrite
                    # protection works (using "auto" for File URIs would
                    # use hard links and subsequent transfer would work
                    # because it knows they are the same file).
                    transferred = butler.retrieveArtifacts(
                        [ref], destination, preserve_path=preserve_path, transfer="copy"
                    )
                    self.assertGreater(len(transferred), 0)
                    artifacts = list(ResourcePath.findFileResources([destination]))
                    self.assertEqual(set(transferred), set(artifacts))

                    for artifact in transferred:
                        path_in_destination = artifact.relative_to(destination)
                        self.assertIsNotNone(path_in_destination)

                        # when path is not preserved there should not be
                        # any path separators.
                        num_seps = path_in_destination.count("/")
                        if preserve_path:
                            self.assertGreater(num_seps, 0)
                        else:
                            self.assertEqual(num_seps, 0)

                    primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                    n_uris = len(secondary_uris)
                    if primary_uri:
                        n_uris += 1
                    self.assertEqual(
                        len(artifacts),
                        n_uris,
                        "Comparing expected artifacts vs actual:"
                        f" {artifacts} vs {primary_uri} and {secondary_uris}",
                    )

                    if preserve_path:
                        # No need to run these twice
                        with self.assertRaises(ValueError):
                            butler.retrieveArtifacts([ref], destination, transfer="move")

                        with self.assertRaises(FileExistsError):
                            butler.retrieveArtifacts([ref], destination)

                        transferred_again = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, overwrite=True
                        )
                        self.assertEqual(set(transferred_again), set(transferred))

            # Now remove the dataset completely.
            butler.pruneDatasets([ref], purge=True, unstore=True, run=this_run)
            # Lookup with original args should still fail.
            with self.assertRaises(LookupError):
                butler.datasetExists(*args, collections=this_run)
            # getDirect() should still fail.
            with self.assertRaises(FileNotFoundError):
                butler.getDirect(ref)
            # Registry shouldn't be able to find it by dataset_id anymore.
            self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know they are
            # empty
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present

        return butler

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection raises a
        # CollectionError.
        with self.assertRaises(CollectionError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Removing the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run=self.default_run)
            self.assertIsInstance(butler, Butler)

            # Even with a ResourcePath.
            butler = Butler(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {self.default_run})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, CollectionSearch.fromExpression(["other"]))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler("not_there", writeable=False)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases"):
            # No aliases registered.
            Butler("not_there")
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testIngest(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile, refs=refs, formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy", record_validation_info=False)

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

    def testPruneCollections(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge and not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [])

        # Now that the collections have been pruned we can remove the
        # dataset type
        butler.registry.removeDatasetType(datasetType.name)

    def testPickle(self):
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            (
                "instrument",
                {"instrument": "DummyCam"},
                {"instrument": "DummyHSC"},
                {"instrument": "DummyCamComp"},
            ),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry entries are not created
        # for components but querying them can return the components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""

        butler = Butler(self.tmpConfigFile, run=self.default_run)

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check whether a file exists at the given path (relative to root).

        The test testPutTemplates verifies the actual physical existence of
        the files in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(
                butler.datastore.root, f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle"
            ),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(
                butler.datastore.root, f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle"
            ),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)
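
    # A note on the expected failure above: "metric3" presumably maps, in this
    # repo's test template configuration, to a file template without enough
    # data ID fields to guarantee unique filenames, which is what
    # FileTemplateValidationError reports. The template itself lives in the
    # test config, not in this file.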

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """This test does an export to a temp directory and an import back
        into a new temp directory repo. It does not assume a posix datastore.
        """
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        print("Root:", exportButler.datastore.root)
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements even
                # though there aren't any in these datasets or in the database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public api.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                        reuse_ids=False,
                    )
                importButler = Butler(importDir, run=self.default_run)
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )

    def testRemoveRuns(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # put a dataset in each
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])
        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler"""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self):
        """Independent test of constructor using PathLike."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self):
        """Test local export using all transfer modes"""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        datastoreRoot = exportButler.datastore.root

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume local file system
            self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
                    export.saveDatasets(datasets)
                for path in pathsInStore:
                    self.assertTrue(
                        self.checkFileExists(exportDir, path),
                        f"Check that mode {transfer} exported files",
                    )
1341 def testPruneDatasets(self):
1342 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1343 butler = Butler(self.tmpConfigFile, writeable=True)
1344 # Load registry data with dimensions to hang datasets off of.
1345 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1346 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1347 # Add some RUN-type collections.
1348 run1 = "run1"
1349 butler.registry.registerRun(run1)
1350 run2 = "run2"
1351 butler.registry.registerRun(run2)
1352 # put some datasets. ref1 and ref2 have the same data ID, and are in
1353 # different runs. ref3 has a different data ID.
1354 metric = makeExampleMetrics()
1355 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1356 datasetType = self.addDatasetType(
1357 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1358 )
1359 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1360 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1361 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1363 # Simple prune.
1364 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1365 with self.assertRaises(LookupError):
1366 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)
1368 # Put data back.
1369 ref1 = butler.put(metric, ref1.unresolved(), run=run1)
1370 ref2 = butler.put(metric, ref2.unresolved(), run=run2)
1371 ref3 = butler.put(metric, ref3.unresolved(), run=run1)
1373 # Check that in normal mode, deleting the datastore record first
1374 # means that trash will not touch the file.
1375 uri1 = butler.datastore.getURI(ref1)
1376 butler.datastore.bridge.moveToTrash([ref1]) # Update the dataset_location table
1377 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
1378 butler.datastore.trash(ref1)
1379 butler.datastore.emptyTrash()
1380 self.assertTrue(uri1.exists())
1381 uri1.remove() # Clean it up.
1383 # Simulate execution butler setup by deleting the datastore
1384 # record but keeping the file around and trusting.
1385 butler.datastore.trustGetRequest = True
1386 uri2 = butler.datastore.getURI(ref2)
1387 uri3 = butler.datastore.getURI(ref3)
1388 self.assertTrue(uri2.exists())
1389 self.assertTrue(uri3.exists())
1391 # Remove the datastore record.
1392 butler.datastore.bridge.moveToTrash([ref2]) # Update the dataset_location table
1393 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
1394 self.assertTrue(uri2.exists())
1395 butler.datastore.trash([ref2, ref3])
1396 # Immediate removal of the ref2 file.
1397 self.assertFalse(uri2.exists())
1398 # But ref3 has to wait for emptyTrash().
1399 self.assertTrue(uri3.exists())
1400 butler.datastore.emptyTrash()
1401 self.assertFalse(uri3.exists())
1403 # Clear out the datasets from registry.
1404 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
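# pruneDatasets also supports gentler modes than the full purge used above.
# A sketch, assuming refs like those created earlier (disassociate, unstore,
# tags, and purge are the keyword arguments of this API):
#
#     # Delete the stored file artifacts but keep the registry entries.
#     butler.pruneDatasets([ref1], unstore=True, disassociate=False)
#     # Remove the datasets from a TAGGED collection without deleting them.
#     butler.pruneDatasets([ref2], tags=["some-tag"], disassociate=True)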
1406 def testPytypePutCoercion(self):
1407 """Test python type coercion on Butler.get and put."""
1409 # Store some data with the normal example storage class.
1410 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1411 datasetTypeName = "test_metric"
1412 butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)
1414 dataId = {"instrument": "DummyCamComp", "visit": 423}
1416 # Put a dict and this should coerce to a MetricsExample
1417 test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
1418 metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
1419 test_metric = butler.getDirect(metric_ref)
1420 self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
1421 self.assertEqual(test_metric.summary, test_dict["summary"])
1422 self.assertEqual(test_metric.output, test_dict["output"])
1424 # Check that the put still works if a DatasetType is given with
1425 # a definition matching this python type.
1426 registry_type = butler.registry.getDatasetType(datasetTypeName)
1427 this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
1428 metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
1429 self.assertEqual(metric2_ref.datasetType, registry_type)
1431 # The get will return the type expected by registry.
1432 test_metric2 = butler.getDirect(metric2_ref)
1433 self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")
1435 # Make a new DatasetRef with the compatible but different DatasetType.
1436 # This should now return a dict.
1437 new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
1438 test_dict2 = butler.getDirect(new_ref)
1439 self.assertEqual(get_full_type_name(test_dict2), "dict")
1441 # Get it again with the wrong dataset type definition using get()
1442 # rather than getDirect(). This should be consistent with getDirect()
1443 # behavior and return the type of the DatasetType.
1444 test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
1445 self.assertEqual(get_full_type_name(test_dict3), "dict")
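# In short: put() coerces the in-memory object to the storage class of the
# DatasetType it is given (when the two are compatible), while get() and
# getDirect() return the python type of whichever DatasetType definition
# was used for the lookup.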
1447 def testPytypeCoercion(self):
1448 """Test python type coercion on Butler.get and put."""
1450 # Store some data with the normal example storage class.
1451 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1452 datasetTypeName = "test_metric"
1453 butler = self.runPutGetTest(storageClass, datasetTypeName)
1455 dataId = {"instrument": "DummyCamComp", "visit": 423}
1456 metric = butler.get(datasetTypeName, dataId=dataId)
1457 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1459 datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
1460 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1462 # Now need to hack the registry dataset type definition.
1463 # There is no API for this.
1464 manager = butler.registry._managers.datasets
1465 manager._db.update(
1466 manager._static.dataset_type,
1467 {"name": datasetTypeName},
1468 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1469 )
1471 # Force reset of dataset type cache
1472 butler.registry.refresh()
1474 datasetType_new = butler.registry.getDatasetType(datasetTypeName)
1475 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1476 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1478 metric_model = butler.get(datasetTypeName, dataId=dataId)
1479 self.assertNotEqual(type(metric_model), type(metric))
1480 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1482 # Put the model and read it back to show that everything now
1483 # works as normal.
1484 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1485 metric_model_new = butler.get(metric_ref)
1486 self.assertEqual(metric_model_new, metric_model)
1488 # Hack the storage class again to something that will fail on the
1489 # get with no conversion class.
1490 manager._db.update(
1491 manager._static.dataset_type,
1492 {"name": datasetTypeName},
1493 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1494 )
1495 butler.registry.refresh()
1497 with self.assertRaises(ValueError):
1498 butler.get(datasetTypeName, dataId=dataId)
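# The final get() fails because the hacked storage class
# ("StructuredDataListYaml") has a python type with no registered
# conversion from the stored MetricsExample, so the butler cannot coerce
# the artifact on read and raises ValueError.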
1501@unittest.skipUnless(testing is not None, "testing.postgresql module not found")
1502class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1503 """PosixDatastore specialization of a butler using Postgres"""
1505 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1506 fullConfigKey = ".datastore.formatters"
1507 validationCanFail = True
1508 datastoreStr = ["/tmp"]
1509 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1510 registryStr = "PostgreSQL@test"
1512 @staticmethod
1513 def _handler(postgresql):
1514 engine = sqlalchemy.engine.create_engine(postgresql.url())
1515 with engine.begin() as connection:
1516 connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;"))
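# The btree_gist extension is needed because the registry's PostgreSQL
# schema uses exclusion constraints that combine ordinary scalar columns
# with timespan ranges, which the default GiST operator classes cannot
# index on their own.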
1518 @classmethod
1519 def setUpClass(cls):
1520 # Create the postgres test server.
1521 cls.postgresql = testing.postgresql.PostgresqlFactory(
1522 cache_initialized_db=True, on_initialized=cls._handler
1523 )
1524 super().setUpClass()
1526 @classmethod
1527 def tearDownClass(cls):
1528 # Clean up any lingering SQLAlchemy engines/connections
1529 # so they're closed before we shut down the server.
1530 gc.collect()
1531 cls.postgresql.clear_cache()
1532 super().tearDownClass()
1534 def setUp(self):
1535 self.server = self.postgresql()
1537 # Need to add a registry section to the config.
1538 self._temp_config = False
1539 config = Config(self.configFile)
1540 config["registry", "db"] = self.server.url()
1541 with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh:
1542 config.dump(fh)
1543 self.configFile = fh.name
1544 self._temp_config = True
1545 super().setUp()
1547 def tearDown(self):
1548 self.server.stop()
1549 if self._temp_config and os.path.exists(self.configFile):
1550 os.remove(self.configFile)
1551 super().tearDown()
1553 def testMakeRepo(self):
1554 # The base class test assumes that it's using sqlite and assumes
1555 # the config file is acceptable to sqlite.
1556 raise unittest.SkipTest("Postgres config is not compatible with this test.")
1559class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1560 """InMemoryDatastore specialization of a butler"""
1562 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1563 fullConfigKey = None
1564 useTempRoot = False
1565 validationCanFail = False
1566 datastoreStr = ["datastore='InMemory"]
1567 datastoreName = ["InMemoryDatastore@"]
1568 registryStr = "/gen3.sqlite3"
1570 def testIngest(self):
1571 pass
1574class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1575 """PosixDatastore specialization"""
1577 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
1578 fullConfigKey = ".datastore.datastores.1.formatters"
1579 validationCanFail = True
1580 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
1581 datastoreName = [
1582 "InMemoryDatastore@",
1583 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
1584 "SecondDatastore",
1585 ]
1586 registryStr = "/gen3.sqlite3"
1589class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
1590 """Test that a yaml file in one location can refer to a root in another."""
1592 datastoreStr = ["dir1"]
1593 # Disable the makeRepo test since we are deliberately not using
1594 # butler.yaml as the config name.
1595 fullConfigKey = None
1597 def setUp(self):
1598 self.root = makeTestTempDir(TESTDIR)
1600 # Make a new repository in one place
1601 self.dir1 = os.path.join(self.root, "dir1")
1602 Butler.makeRepo(self.dir1, config=Config(self.configFile))
1604 # Move the yaml file to a different place and add a "root"
1605 self.dir2 = os.path.join(self.root, "dir2")
1606 os.makedirs(self.dir2, exist_ok=True)
1607 configFile1 = os.path.join(self.dir1, "butler.yaml")
1608 config = Config(configFile1)
1609 config["root"] = self.dir1
1610 configFile2 = os.path.join(self.dir2, "butler2.yaml")
1611 config.dumpToUri(configFile2)
1612 os.remove(configFile1)
1613 self.tmpConfigFile = configFile2
1615 def testFileLocations(self):
1616 self.assertNotEqual(self.dir1, self.dir2)
1617 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
1618 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
1619 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
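# The relocated butler2.yaml written in setUp is an ordinary butler config
# with an added top-level "root" key pointing back at the original
# directory; schematically (paths illustrative):
#
#     datastore:
#       ...
#     registry:
#       ...
#     root: /path/to/dir1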
1622class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
1623 """Test that a config file created by makeRepo outside of repo works."""
1625 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1627 def setUp(self):
1628 self.root = makeTestTempDir(TESTDIR)
1629 self.root2 = makeTestTempDir(TESTDIR)
1631 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
1632 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1634 def tearDown(self):
1635 if os.path.exists(self.root2):
1636 shutil.rmtree(self.root2, ignore_errors=True)
1637 super().tearDown()
1639 def testConfigExistence(self):
1640 c = Config(self.tmpConfigFile)
1641 uri_config = ResourcePath(c["root"])
1642 uri_expected = ResourcePath(self.root, forceDirectory=True)
1643 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
1644 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
1646 def testPutGet(self):
1647 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1648 self.runPutGetTest(storageClass, "test_metric")
1651class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
1652 """Test that a config file created by makeRepo outside of repo works."""
1654 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1656 def setUp(self):
1657 self.root = makeTestTempDir(TESTDIR)
1658 self.root2 = makeTestTempDir(TESTDIR)
1660 self.tmpConfigFile = self.root2
1661 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1663 def testConfigExistence(self):
1664 # Append the yaml file name, else the Config constructor does not
1665 # know the file type.
1666 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
1667 super().testConfigExistence()
1670class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
1671 """Test that a config file created by makeRepo outside of repo works."""
1673 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1675 def setUp(self):
1676 self.root = makeTestTempDir(TESTDIR)
1677 self.root2 = makeTestTempDir(TESTDIR)
1679 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
1680 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1683@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
1684class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1685 """S3Datastore specialization of a butler; an S3 storage Datastore +
1686 a local in-memory SqlRegistry.
1687 """
1689 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
1690 fullConfigKey = None
1691 validationCanFail = True
1693 bucketName = "anybucketname"
1694 """Name of the Bucket that will be used in the tests. The name is read from
1695 the config file used with the tests during set-up.
1696 """
1698 root = "butlerRoot/"
1699 """Root repository directory expected to be used in case useTempRoot=False.
1700 Otherwise the root is set to a 20 characters long randomly generated string
1701 during set-up.
1702 """
1704 datastoreStr = [f"datastore={root}"]
1705 """Contains all expected root locations in a format expected to be
1706 returned by Butler stringification.
1707 """
1709 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
1710 """The expected format of the S3 Datastore string."""
1712 registryStr = "/gen3.sqlite3"
1713 """Expected format of the Registry string."""
1715 mock_s3 = mock_s3()
1716 """The mocked s3 interface from moto."""
1718 def genRoot(self):
1719 """Returns a random string of len 20 to serve as a root
1720 name for the temporary bucket repo.
1722 This is equivalent to tempfile.mkdtemp as this is what self.root
1723 becomes when useTempRoot is True.
1724 """
1725 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1726 return rndstr + "/"
1728 def setUp(self):
1729 config = Config(self.configFile)
1730 uri = ResourcePath(config[".datastore.datastore.root"])
1731 self.bucketName = uri.netloc
1733 # Enable S3 mocking of tests.
1734 self.mock_s3.start()
1736 # set up some fake credentials if they do not exist
1737 self.usingDummyCredentials = setAwsEnvCredentials()
1739 if self.useTempRoot:
1740 self.root = self.genRoot()
1741 rooturi = f"s3://{self.bucketName}/{self.root}"
1742 config.update({"datastore": {"datastore": {"root": rooturi}}})
1744 # Need a local folder to store the registry database.
1745 self.reg_dir = makeTestTempDir(TESTDIR)
1746 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1748 # Moto needs to know that we expect the bucket to exist
1749 # (this used to be the class attribute bucketName).
1750 s3 = boto3.resource("s3")
1751 s3.create_bucket(Bucket=self.bucketName)
1753 self.datastoreStr = f"datastore={self.root}"
1754 self.datastoreName = [f"FileDatastore@{rooturi}"]
1755 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
1756 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
1758 def tearDown(self):
1759 s3 = boto3.resource("s3")
1760 bucket = s3.Bucket(self.bucketName)
1761 try:
1762 bucket.objects.all().delete()
1763 except botocore.exceptions.ClientError as e:
1764 if e.response["Error"]["Code"] == "404":
1765 # the key was not reachable - pass
1766 pass
1767 else:
1768 raise
1770 bucket = s3.Bucket(self.bucketName)
1771 bucket.delete()
1773 # Stop the S3 mock.
1774 self.mock_s3.stop()
1776 # unset any potentially set dummy credentials
1777 if self.usingDummyCredentials:
1778 unsetAwsEnvCredentials()
1780 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1781 shutil.rmtree(self.reg_dir, ignore_errors=True)
1783 if self.useTempRoot and os.path.exists(self.root):
1784 shutil.rmtree(self.root, ignore_errors=True)
1786 super().tearDown()
1789@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!")
1790class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1791 """WebdavDatastore specialization of a butler; a Webdav storage Datastore +
1792 a local in-memory SqlRegistry.
1793 """
1795 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml")
1796 fullConfigKey = None
1797 validationCanFail = True
1799 serverName = "localhost"
1800 """Name of the server that will be used in the tests.
1801 """
1803 portNumber = 8080
1804 """Port on which the webdav server listens. Automatically chosen
1805 at setUpClass via the _getfreeport() method
1806 """
1808 root = "butlerRoot/"
1809 """Root repository directory expected to be used in case useTempRoot=False.
1810 Otherwise the root is set to a 20 characters long randomly generated string
1811 during set-up.
1812 """
1814 datastoreStr = [f"datastore={root}"]
1815 """Contains all expected root locations in a format expected to be
1816 returned by Butler stringification.
1817 """
1819 datastoreName = ["FileDatastore@https://{serverName}/{root}"]
1820 """The expected format of the WebdavDatastore string."""
1822 registryStr = "/gen3.sqlite3"
1823 """Expected format of the Registry string."""
1825 serverThread = None
1826 """Thread in which the local webdav server will run"""
1828 stopWebdavServer = False
1829 """This flag will cause the webdav server to
1830 gracefully shut down when True
1831 """
1833 def genRoot(self):
1834 """Returns a random string of len 20 to serve as a root
1835 name for the temporary bucket repo.
1837 This is equivalent to tempfile.mkdtemp as this is what self.root
1838 becomes when useTempRoot is True.
1839 """
1840 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1841 return rndstr + "/"
1843 @classmethod
1844 def setUpClass(cls):
1845 # Do the same as inherited class
1846 cls.storageClassFactory = StorageClassFactory()
1847 cls.storageClassFactory.addFromConfig(cls.configFile)
1849 cls.portNumber = cls._getfreeport()
1850 # Run a local webdav server on which tests will be run
1851 cls.serverThread = Thread(
1852 target=cls._serveWebdav, args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), daemon=True
1853 )
1854 cls.serverThread.start()
1855 # Wait for it to start
1856 time.sleep(3)
1858 @classmethod
1859 def tearDownClass(cls):
1860 # Ask for graceful shut down of the webdav server
1861 cls.stopWebdavServer = True
1862 # Wait for the thread to exit
1863 cls.serverThread.join()
1864 super().tearDownClass()
1866 def setUp(self):
1867 config = Config(self.configFile)
1869 if self.useTempRoot:
1870 self.root = self.genRoot()
1871 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}"
1872 config.update({"datastore": {"datastore": {"root": self.rooturi}}})
1874 # Need a local folder to store the registry database.
1875 self.reg_dir = makeTestTempDir(TESTDIR)
1876 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1878 self.datastoreStr = f"datastore={self.root}"
1879 self.datastoreName = [f"FileDatastore@{self.rooturi}"]
1881 if not _is_webdav_endpoint(self.rooturi):
1882 raise OSError("Webdav server not running properly: cannot run tests.")
1884 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False)
1885 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml")
1887 def tearDown(self):
1888 # Clear temporary directory
1889 ResourcePath(self.rooturi).remove()
1890 ResourcePath(self.rooturi).session.close()
1892 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1893 shutil.rmtree(self.reg_dir, ignore_errors=True)
1895 if self.useTempRoot and os.path.exists(self.root):
1896 shutil.rmtree(self.root, ignore_errors=True)
1898 super().tearDown()
1900 def _serveWebdav(self, port: int, stopWebdavServer):
1901 """Starts a local webdav-compatible HTTP server,
1902 Listening on http://localhost:port
1903 This server only runs when this test class is instantiated,
1904 and then shuts down. Must be started is a separate thread.
1906 Parameters
1907 ----------
1908 port : `int`
1909 The port number on which the server should listen.
1910 """
1911 root_path = gettempdir()
1913 config = {
1914 "host": "0.0.0.0",
1915 "port": port,
1916 "provider_mapping": {"/": root_path},
1917 "http_authenticator": {"domain_controller": None},
1918 "simple_dc": {"user_mapping": {"*": True}},
1919 "verbose": 0,
1920 }
1921 app = WsgiDAVApp(config)
1923 server_args = {
1924 "bind_addr": (config["host"], config["port"]),
1925 "wsgi_app": app,
1926 }
1927 server = wsgi.Server(**server_args)
1928 server.prepare()
1930 try:
1931 # Start the actual server in a separate thread
1932 t = Thread(target=server.serve, daemon=True)
1933 t.start()
1934 # Watch stopWebdavServer, and gracefully
1935 # shut down the server when it becomes True.
1936 while True:
1937 if stopWebdavServer():
1938 break
1939 time.sleep(1)
1940 except KeyboardInterrupt:
1941 print("Caught Ctrl-C, shutting down...")
1942 finally:
1943 server.stop()
1944 t.join()
1946 def _getfreeport():
1947 """
1948 Determines a free port using sockets.
1949 """
1950 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
1951 free_socket.bind(("0.0.0.0", 0))
1952 free_socket.listen()
1953 port = free_socket.getsockname()[1]
1954 free_socket.close()
1955 return port
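# The returned port is only probably free: another process may claim it
# between close() and the webdav server's own bind. Setting SO_REUSEADDR
# on the probe socket (sketch) also avoids failures when the port lingers
# in TIME_WAIT:
#
#     free_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)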
1958class PosixDatastoreTransfers(unittest.TestCase):
1959 """Test data transfers between butlers.
1961 Transfers are tested for different managers: UUID to UUID and integer
1962 to integer. UUID to integer is not supported since we do not currently
1963 want to allow that. Integer to UUID is supported with the caveat
1964 that a UUID4 will be generated, which will be incorrect for raw
1965 dataset types; the test ignores that.
1966 """
1968 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1970 @classmethod
1971 def setUpClass(cls):
1972 cls.storageClassFactory = StorageClassFactory()
1973 cls.storageClassFactory.addFromConfig(cls.configFile)
1975 def setUp(self):
1976 self.root = makeTestTempDir(TESTDIR)
1977 self.config = Config(self.configFile)
1979 def tearDown(self):
1980 removeTestTempDir(self.root)
1982 def create_butler(self, manager, label):
1983 config = Config(self.configFile)
1984 config["registry", "managers", "datasets"] = manager
1985 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
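# For reference, the config override above amounts to writing this into
# the butler config before repo creation (sketch; the manager value is one
# of the fully qualified class names used in the tests below):
#
#     registry:
#       managers:
#         datasets: lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID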
1987 def create_butlers(self, manager1, manager2):
1988 self.source_butler = self.create_butler(manager1, "1")
1989 self.target_butler = self.create_butler(manager2, "2")
1991 def testTransferUuidToUuid(self):
1992 self.create_butlers(
1993 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1994 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
1995 )
1996 # Setting id_gen_map should have no effect here
1997 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
1999 def testTransferIntToInt(self):
2000 self.create_butlers(
2001 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
2002 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
2003 )
2004 # Integer dataset IDs only allow the UNIQUE ID generation mode.
2005 self.assertButlerTransfers()
2007 def testTransferIntToUuid(self):
2008 self.create_butlers(
2009 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager",
2010 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2011 )
2012 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE})
2014 def testTransferMissing(self):
2015 """Test transfers where datastore records are missing.
2017 This is how execution butler works.
2018 """
2019 self.create_butlers(
2020 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2021 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2022 )
2024 # Configure the source butler to allow trust.
2025 self.source_butler.datastore.trustGetRequest = True
2027 self.assertButlerTransfers(purge=True)
2029 def testTransferMissingDisassembly(self):
2030 """Test transfers where datastore records are missing.
2032 This is how execution butler works.
2033 """
2034 self.create_butlers(
2035 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2036 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID",
2037 )
2039 # Configure the source butler to allow trust.
2040 self.source_butler.datastore.trustGetRequest = True
2042 # Test disassembly.
2043 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
2045 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"):
2046 """Test that a run can be transferred to another butler."""
2048 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
2049 datasetTypeName = "random_data"
2051 # The test will create 3 collections, and we will want to transfer
2052 # two of those three.
2053 runs = ["run1", "run2", "other"]
2055 # Also want to use two different dataset types to ensure that
2056 # grouping works.
2057 datasetTypeNames = ["random_data", "random_data_2"]
2059 # Create the run collections in the source butler.
2060 for run in runs:
2061 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
2063 # Create dimensions in both butlers (transfer will not create them).
2064 n_exposures = 30
2065 for butler in (self.source_butler, self.target_butler):
2066 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
2067 butler.registry.insertDimensionData(
2068 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
2069 )
2070 butler.registry.insertDimensionData(
2071 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
2072 )
2074 for i in range(n_exposures):
2075 butler.registry.insertDimensionData(
2076 "exposure",
2077 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
2078 )
2080 # Create dataset types in the source butler.
2081 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
2082 for datasetTypeName in datasetTypeNames:
2083 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
2084 self.source_butler.registry.registerDatasetType(datasetType)
2086 # Write a dataset to an unrelated run -- this will ensure that
2087 # we are rewriting integer dataset ids in the target if necessary.
2088 # Will not be relevant for UUID.
2089 run = "distraction"
2090 butler = Butler(butler=self.source_butler, run=run)
2091 butler.put(
2092 makeExampleMetrics(),
2093 datasetTypeName,
2094 exposure=1,
2095 instrument="DummyCamComp",
2096 physical_filter="d-r",
2097 )
2099 # Write some example metrics to the source
2100 butler = Butler(butler=self.source_butler)
2102 # Set of DatasetRefs that should be in the list of refs to transfer
2103 # but which will not be transferred.
2104 deleted = set()
2106 n_expected = 20 # Number of datasets expected to be transferred
2107 source_refs = []
2108 for i in range(n_exposures):
2109 # Put a third of the datasets into each collection; only retain
2110 # two thirds.
2111 index = i % 3
2112 run = runs[index]
2113 datasetTypeName = datasetTypeNames[i % 2]
2115 metric_data = {
2116 "summary": {"counter": i},
2117 "output": {"text": "metric"},
2118 "data": [2 * x for x in range(i)],
2119 }
2120 metric = MetricsExample(**metric_data)
2121 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
2122 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
2124 # Remove the datastore record using the low-level API.
2125 if purge:
2126 # Remove records for a fraction.
2127 if index == 1:
2129 # For one of these delete the file as well.
2130 # This allows the "missing" code to filter the
2131 # file out.
2132 if not deleted:
2133 primary, uris = butler.datastore.getURIs(ref)
2134 if primary:
2135 primary.remove()
2136 for uri in uris.values():
2137 uri.remove()
2138 n_expected -= 1
2139 deleted.add(ref)
2141 # Remove the datastore record.
2142 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id})
2144 if index < 2:
2145 source_refs.append(ref)
2146 if ref not in deleted:
2147 new_metric = butler.get(ref.unresolved(), collections=run)
2148 self.assertEqual(new_metric, metric)
2150 # Create some bad dataset types to ensure we check for inconsistent
2151 # definitions.
2152 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
2153 for datasetTypeName in datasetTypeNames:
2154 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
2155 self.target_butler.registry.registerDatasetType(datasetType)
2156 with self.assertRaises(ConflictingDefinitionError):
2157 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2158 # And remove the bad definitions.
2159 for datasetTypeName in datasetTypeNames:
2160 self.target_butler.registry.removeDatasetType(datasetTypeName)
2162 # Transfer without creating dataset types should fail.
2163 with self.assertRaises(KeyError):
2164 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map)
2166 # Now transfer them to the second butler
2167 with self.assertLogs(level=logging.DEBUG) as cm:
2168 transferred = self.target_butler.transfer_from(
2169 self.source_butler, source_refs, id_gen_map=id_gen_map, register_dataset_types=True
2170 )
2171 self.assertEqual(len(transferred), n_expected)
2172 log_output = ";".join(cm.output)
2173 self.assertIn("found in datastore for chunk", log_output)
2174 self.assertIn("Creating output run", log_output)
2176 # Do the transfer twice to ensure that it will do nothing extra.
2177 # Only do this if purge=True because it does not work for int
2178 # dataset_id.
2179 if purge:
2180 # This should not need to register dataset types.
2181 transferred = self.target_butler.transfer_from(
2182 self.source_butler, source_refs, id_gen_map=id_gen_map
2183 )
2184 self.assertEqual(len(transferred), n_expected)
2186 # Also do an explicit low-level transfer to trigger some
2187 # edge cases.
2188 with self.assertLogs(level=logging.DEBUG) as cm:
2189 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
2190 log_output = ";".join(cm.output)
2191 self.assertIn("no file artifacts exist", log_output)
2193 with self.assertRaises(TypeError):
2194 self.target_butler.datastore.transfer_from(self.source_butler, source_refs)
2196 with self.assertRaises(ValueError):
2197 self.target_butler.datastore.transfer_from(
2198 self.source_butler.datastore, source_refs, transfer="split"
2199 )
2201 # Now try to get the same refs from the new butler.
2202 for ref in source_refs:
2203 if ref not in deleted:
2204 unresolved_ref = ref.unresolved()
2205 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run)
2206 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run)
2207 self.assertEqual(new_metric, old_metric)
2209 # Now prune the run2 collection and create a CHAINED collection in
2210 # its place. This should block the transfer.
2211 self.target_butler.pruneCollection("run2", purge=True, unstore=True)
2212 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2213 with self.assertRaises(CollectionTypeError):
2214 # Re-importing the run1 datasets can be problematic if they
2215 # use integer IDs so filter those out.
2216 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2217 self.target_butler.transfer_from(self.source_butler, to_transfer, id_gen_map=id_gen_map)
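# The essential transfer pattern exercised by this test, reduced to its
# core (butler names hypothetical; queryDatasets and transfer_from are the
# APIs used above):
#
#     refs = list(source.registry.queryDatasets(..., collections=...))
#     target.transfer_from(source, refs, register_dataset_types=True)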
2220 if __name__ == "__main__":
2221 unittest.main()