Coverage for tests/test_butler.py: 15%
1413 statements

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler."""

from __future__ import annotations

import gc
import json
import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import string
import tempfile
import unittest
import uuid
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, cast

try:
    import boto3
    import botocore
    from lsst.resources.s3utils import clean_test_environment_for_s3

    try:
        from moto import mock_aws  # v5
    except ImportError:
        from moto import mock_s3 as mock_aws
except ImportError:
    boto3 = None

    def mock_aws(*args: Any, **kwargs: Any) -> Any:  # type: ignore[no-untyped-def]
        """No-op decorator in case moto's mock_aws cannot be imported."""
        return None
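
# A sketch of how the fallback above is typically consumed (hypothetical test
# class name; because the boto3 guard skips the whole test case first, the
# no-op stand-in decorator is never actually exercised when moto is missing):
#
#     @unittest.skipIf(not boto3, "boto3/moto not available")
#     @mock_aws
#     class ExampleS3ButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
#         ...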

try:
    from lsst.daf.butler.tests.server import create_test_server
except ImportError:
    create_test_server = None

try:
    # It's possible but silly to have testing.postgresql installed without
    # having the postgresql server installed (because then nothing in
    # testing.postgresql would work), so we use the presence of that module
    # to test whether we can expect the server to be available.
    import testing.postgresql  # type: ignore[import]
except ImportError:
    testing = None

import astropy.time
import sqlalchemy
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    ButlerRepoIndex,
    CollectionType,
    Config,
    DataCoordinate,
    DatasetExistence,
    DatasetRef,
    DatasetType,
    FileDataset,
    NoDefaultCollectionError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.datastore import NullDatastore
from lsst.daf.butler.datastore.file_templates import FileTemplate, FileTemplateValidationError
from lsst.daf.butler.datastores.fileDatastore import FileDatastore
from lsst.daf.butler.direct_butler import DirectButler
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
    OrphanedRecordError,
)
from lsst.daf.butler.registry.sql_registry import SqlRegistry
from lsst.daf.butler.repo_relocation import BUTLER_ROOT_TAG
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.introspection import get_full_type_name

if TYPE_CHECKING:
    import types

    from lsst.daf.butler import DimensionGroup, Registry, StorageClass

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def clean_environment() -> None:
    """Remove external environment variables that affect the tests."""
    for k in ("DAF_BUTLER_REPOSITORY_INDEX",):
        os.environ.pop(k, None)


def makeExampleMetrics() -> MetricsExample:
    """Return example dataset suitable for tests."""
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent the misdiagnosis
    that might otherwise occur when a standard exception is used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not covered by any other test
    cases.
    """

    def testSearchPath(self) -> None:
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")
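

# The search-path override tested above works because ButlerConfig layers any
# butler.yaml found in the ``searchPaths`` directories over the defaults. A
# minimal sketch of the kind of override file this relies on (illustrative;
# "override_record" is the value the test actually asserts):
#
#     datastore:
#       records:
#         table: override_record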


class ButlerPutGetTests(TestCaseMixin):
    """Helper methods for running a suite of put/get tests against different
    butler configurations.
    """

    root: str
    default_run = "ingésτ😺"
    storageClassFactory: StorageClassFactory
    configFile: str | None
    tmpConfigFile: str
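
    # A minimal sketch of how a concrete test case is expected to fill in the
    # attributes above (hypothetical subclass; ButlerTests.setUp below does
    # exactly this repo creation):
    #
    #     class ExamplePutGetTestCase(ButlerTests, unittest.TestCase):
    #         configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
    #
    # with setUp creating the repository via Butler.makeRepo(self.root, ...)
    # and pointing tmpConfigFile at the resulting butler.yaml.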

    @staticmethod
    def addDatasetType(
        datasetTypeName: str, dimensions: DimensionGroup, storageClass: StorageClass | str, registry: Registry
    ) -> DatasetType:
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls) -> None:
        cls.storageClassFactory = StorageClassFactory()
        if cls.configFile is not None:
            cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(
        self,
        butler: Butler,
        datasetRef: DatasetRef,
        components: tuple[str, ...],
        reference: Any,
        collections: Any = None,
    ) -> None:
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self) -> None:
        if self.root is not None:
            removeTestTempDir(self.root)

    def create_empty_butler(self, run: str | None = None, writeable: bool | None = None) -> Butler:
        """Create a Butler for the test repository, without inserting test
        data.
        """
        butler = Butler.from_config(self.tmpConfigFile, run=run, writeable=writeable)
        assert isinstance(butler, DirectButler), "Expect DirectButler in configuration"
        return butler

    def create_butler(
        self, run: str, storageClass: StorageClass | str, datasetTypeName: str
    ) -> tuple[Butler, DatasetType]:
        """Create a Butler for the test repository and insert some test data
        into it.
        """
        butler = self.create_empty_butler(run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {run})
        # Create and register a DatasetType
        dimensions = butler.dimensions.conform(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        butler.registry.insertDimensionData("day_obs", {"instrument": "DummyCamComp", "id": 20200101})
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
                "day_obs": 20200101,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "day_obs": 20200101,
                },
            )
        return butler, datasetType

    def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Butler:
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = self.default_run
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)
        assert butler.run is not None

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = butler.registry.expandDataId({"instrument": "DummyCamComp", "visit": 423})

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        ref = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run="put_run_1")
        args: tuple[DatasetRef] | tuple[str | DatasetType, DataCoordinate]
        for args in ((ref,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                kwargs: dict[str, Any] = {}
                if not isinstance(args[0], DatasetRef):  # type: ignore
                    kwargs["run"] = this_run
                ref = butler.put(metric, *args, **kwargs)
                self.assertIsInstance(ref, DatasetRef)

                # Test get of a ref.
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get by dataset type name and dataId.
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a DatasetRef.
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId.
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a ref.
                metricOut = butler.getDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

            primary_uri, secondary_uris = butler.getURIs(ref)
            n_uris = len(secondary_uris)
            if primary_uri:
                n_uris += 1

            # Can the artifacts themselves be retrieved?
            if not butler._datastore.isEphemeral:
                # Create a temporary directory to hold the retrieved
                # artifacts.
                with tempfile.TemporaryDirectory(
                    prefix="butler-artifacts-", ignore_cleanup_errors=True
                ) as artifact_root:
                    root_uri = ResourcePath(artifact_root, forceDirectory=True)

                    for preserve_path in (True, False):
                        destination = root_uri.join(f"{preserve_path}_{counter}/")
                        log = logging.getLogger("lsst.x")
                        log.warning("Using destination %s for args %s", destination, args)
                        # Use copy so that we can test that overwrite
                        # protection works (using "auto" for File URIs
                        # would use hard links and subsequent transfer
                        # would work because it knows they are the same
                        # file).
                        transferred = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, transfer="copy"
                        )
                        self.assertGreater(len(transferred), 0)
                        artifacts = list(ResourcePath.findFileResources([destination]))
                        self.assertEqual(set(transferred), set(artifacts))

                        for artifact in transferred:
                            path_in_destination = artifact.relative_to(destination)
                            self.assertIsNotNone(path_in_destination)
                            assert path_in_destination is not None

                            # When path is not preserved there should not
                            # be any path separators.
                            num_seps = path_in_destination.count("/")
                            if preserve_path:
                                self.assertGreater(num_seps, 0)
                            else:
                                self.assertEqual(num_seps, 0)

                        self.assertEqual(
                            len(artifacts),
                            n_uris,
                            "Comparing expected artifacts vs actual:"
                            f" {artifacts} vs {primary_uri} and {secondary_uris}",
                        )

                        if preserve_path:
                            # No need to run these twice
                            with self.assertRaises(ValueError):
                                butler.retrieveArtifacts([ref], destination, transfer="move")

                            with self.assertRaisesRegex(
                                ValueError, "^Destination location must refer to a directory"
                            ):
                                butler.retrieveArtifacts(
                                    [ref], ResourcePath("/some/file.txt", forceDirectory=False)
                                )

                        with self.assertRaises(FileExistsError):
                            butler.retrieveArtifacts([ref], destination)

                        transferred_again = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, overwrite=True
                        )
                        self.assertEqual(set(transferred_again), set(transferred))

            # Now remove the dataset completely.
            butler.pruneDatasets([ref], purge=True, unstore=True)
            # Lookup with original args should still fail.
            kwargs = {"collections": this_run}
            if isinstance(args[0], DatasetRef):
                kwargs = {}  # Prevent warning from being issued.
            self.assertFalse(butler.exists(*args, **kwargs))
            # get() should still fail.
            with self.assertRaises(FileNotFoundError):
                butler.get(ref)
            # Registry shouldn't be able to find it by dataset_id anymore.
            self.assertIsNone(butler.get_dataset(ref.id))

            # Do explicit registry removal since we know they are
            # empty
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Create DatasetRef for put using default run.
        refIn = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run=butler.run)

        # Check that getDeferred fails with standalone ref.
        with self.assertRaises(LookupError):
            butler.getDeferred(refIn)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        assert metric.data is not None  # for mypy
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.find_dataset(compNameS, dataId, collections=butler.collections)
            assert compRef is not None
            summary = butler.get(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaisesRegex(
            ValueError,
            "(Supplied dataset type .* inconsistent with registry)"
            "|(The new storage class .* is not compatible with the existing storage class)",
        ):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaisesRegex(ValueError, "DatasetRef given, cannot use dataId as well"):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match.
        with self.assertRaises(FileNotFoundError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=uuid.UUID(int=101), run=butler.run))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaisesRegex(KeyError, "Parameter 'unsupported' not understood"):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Add the same ref again, so we can check that duplicate put fails.
        ref = butler.put(metric, datasetType, dataId)

        # Repeat put will fail.
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Repeat the same sequence with resolved ref.
        butler.pruneDatasets([ref], unstore=True, purge=True)
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaisesRegex(ConflictingDefinitionError, "Datastore already contains dataset"):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # In case of resolved ref this write will succeed.
        ref = butler.put(metric, refIn)

        # Leave the dataset in place since some downstream tests require
        # something to be present

        return butler

    def testDeferredCollectionPassing(self) -> None:
        # Construct a butler with no run or collection, but make it writeable.
        butler = self.create_empty_butler(writeable=True)
        # Create and register a DatasetType
        dimensions = butler.dimensions.conform(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData("day_obs", {"instrument": "DummyCamComp", "id": 20250101})
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "day_obs": 20250101,
            },
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run collection should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.exists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is an error.
        with self.assertRaises(NoDefaultCollectionError):
            butler.exists(datasetType, dataId)
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Removing the dataset from the new collection should leave it
        # findable in the original run collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.exists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True
    validationCanFail: bool
    fullConfigKey: str | None
    registryStr: str | None
    datastoreName: list[str] | None
    datastoreStr: list[str]
    predictionSupported = True
    """Does getURIs support 'prediction mode'?"""

    def setUp(self) -> None:
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def are_uris_equivalent(self, uri1: ResourcePath, uri2: ResourcePath) -> bool:
        """Return True if two URIs refer to the same resource.

        Subclasses may override to handle unique requirements.
        """
        return uri1 == uri2
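
    # An example of the kind of override a subclass might provide (a sketch,
    # assuming a store that serves one file through differing query strings,
    # e.g. signed URLs; not an implementation from this package):
    #
    #     def are_uris_equivalent(self, uri1: ResourcePath, uri2: ResourcePath) -> bool:
    #         return (uri1.scheme, uri1.netloc, uri1.path) == (uri2.scheme, uri2.netloc, uri2.path)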

    def testConstructor(self) -> None:
        """Independent test of constructor."""
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler.from_config(config_dir, run=self.default_run)
            self.assertIsInstance(butler, Butler)

            # Even with a ResourcePath.
            butler = Butler.from_config(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {self.default_run})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler.from_config(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler.from_config(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, ("other",))
        self.assertIsNone(butler2.run)
        self.assertEqual(type(butler._datastore), type(butler2._datastore))
        self.assertEqual(butler._datastore.config, butler2._datastore.config)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"file://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), {"label", "bad_label"})
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler.from_config(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler.from_config("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler.from_config("not_there", writeable=False)
                    with self.assertRaisesRegex(FileNotFoundError, "resolved from alias 'bad_label'"):
                        Butler.from_config("bad_label")
                    with self.assertRaises(FileNotFoundError):
                        # Should ignore aliases.
                        Butler.from_config(ResourcePath("label", forceAbsolute=False))
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertEqual(
                        Butler.get_repo_uri("missing", True), ResourcePath("missing", forceAbsolute=False)
                    )
                    self.assertIn("not known to", str(cm.exception))
                    # Should report no failure.
                    self.assertEqual(ButlerRepoIndex.get_failure_reason(), "")
        with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
            # Now with empty configuration.
            butler_index = Config()
            butler_index.dumpToUri(temp_file)
            with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                with self.assertRaisesRegex(FileNotFoundError, "(no known aliases)"):
                    Butler.from_config("label")
        with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
            # Now with bad contents.
            with open(temp_file.ospath, "w") as fh:
                print("'", file=fh)
            with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                with self.assertRaisesRegex(FileNotFoundError, "(no known aliases:.*could not be read)"):
                    Butler.from_config("label")
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())

            with self.assertRaisesRegex(FileNotFoundError, "index file not found"):
                Butler.from_config("label")

        # Check that we can create Butler when the alias file is not found.
        butler = Butler.from_config(self.tmpConfigFile, writeable=False)
        self.assertIsInstance(butler, Butler)
        with self.assertRaises(RuntimeError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertEqual(Butler.get_repo_uri("label", True), ResourcePath("label", forceAbsolute=False))
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases.*No repository index"):
            # No aliases registered.
            Butler.from_config("not_there")
        self.assertEqual(Butler.get_known_repos(), set())

    def testDafButlerRepositories(self) -> None:
        with unittest.mock.patch.dict(
            os.environ,
            {"DAF_BUTLER_REPOSITORIES": "label: 'https://someuri.com'\notherLabel: 'https://otheruri.com'\n"},
        ):
            self.assertEqual(str(Butler.get_repo_uri("label")), "https://someuri.com")

        with unittest.mock.patch.dict(
            os.environ,
            {
                "DAF_BUTLER_REPOSITORIES": "label: https://someuri.com",
                "DAF_BUTLER_REPOSITORY_INDEX": "https://someuri.com",
            },
        ):
            with self.assertRaisesRegex(RuntimeError, "Only one of the environment variables"):
                Butler.get_repo_uri("label")

        with unittest.mock.patch.dict(
            os.environ,
            {"DAF_BUTLER_REPOSITORIES": "invalid"},
        ):
            with self.assertRaisesRegex(ValueError, "Repository index not in expected format"):
                Butler.get_repo_uri("label")
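
    # For reference, DAF_BUTLER_REPOSITORIES holds an inline YAML mapping of
    # repository label to repository URI, e.g. (values from the test above):
    #
    #     label: 'https://someuri.com'
    #     otherLabel: 'https://otheruri.com'
    #
    # whereas DAF_BUTLER_REPOSITORY_INDEX points at a file containing such a
    # mapping; the test checks that setting both at once is rejected.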

    def testBasicPutGet(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        if self.predictionSupported:
            dataId = {"instrument": "DummyCamComp", "visit": 424}
            uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
            self.assertFalse(components)
            self.assertIsInstance(uri, ResourcePath)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler._datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        if self.predictionSupported:
            # Predicted dataset
            dataId = {"instrument": "DummyCamComp", "visit": 424}
            uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

            if butler._datastore.isEphemeral:
                # Never disassembled
                self.assertIsInstance(uri, ResourcePath)
                self.assertFalse(components)
                self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
                self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
            else:
                self.assertIsNone(uri)
                self.assertEqual(set(components), set(storageClass.components))
                for compuri in components.values():
                    self.assertIsInstance(compuri, ResourcePath)
                    self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                    self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testStorageClassOverrideGet(self) -> None:
        """Test storage class conversion on get with override."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        datasetTypeName = "anything"
        run = self.default_run

        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset.
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        ref = butler.put(metric, datasetType, dataId)

        # Return native type.
        retrieved = butler.get(ref)
        self.assertEqual(retrieved, metric)

        # Specify an override.
        new_sc = self.storageClassFactory.getStorageClass("MetricsConversion")
        model = butler.get(ref, storageClass=new_sc)
        self.assertNotEqual(type(model), type(retrieved))
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override later.
        deferred = butler.getDeferred(ref)
        model = deferred.get(storageClass=new_sc)
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override up front.
        deferred = butler.getDeferred(ref, storageClass=new_sc)
        model = deferred.get()
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Retrieve a component. Should be a tuple.
        data = butler.get("anything.data", dataId, storageClass="StructuredDataDataTestTuple")
        self.assertIs(type(data), tuple)
        self.assertEqual(data, tuple(retrieved.data))

        # Parameter on the write storage class should work regardless
        # of read storage class.
        data = butler.get(
            "anything.data",
            dataId,
            storageClass="StructuredDataDataTestTuple",
            parameters={"slice": slice(2, 4)},
        )
        self.assertEqual(len(data), 2)

        # Try a parameter that is known to the read storage class but not
        # the write storage class.
        with self.assertRaises(KeyError):
            butler.get(
                "anything.data",
                dataId,
                storageClass="StructuredDataDataTestTuple",
                parameters={"xslice": slice(2, 4)},
            )

    def testPytypePutCoercion(self) -> None:
        """Test python type coercion on Butler.get and put."""
        # Store some data with the normal example storage class.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        datasetTypeName = "test_metric"
        butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)

        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Put a dict and this should coerce to a MetricsExample
        test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
        metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
        test_metric = butler.get(metric_ref)
        self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
        self.assertEqual(test_metric.summary, test_dict["summary"])
        self.assertEqual(test_metric.output, test_dict["output"])

        # Check that the put still works if a DatasetType is given with
        # a definition matching this python type.
        registry_type = butler.get_dataset_type(datasetTypeName)
        this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
        metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
        self.assertEqual(metric2_ref.datasetType, registry_type)

        # The get will return the type expected by registry.
        test_metric2 = butler.get(metric2_ref)
        self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")

        # Make a new DatasetRef with the compatible but different DatasetType.
        # This should now return a dict.
        new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
        test_dict2 = butler.get(new_ref)
        self.assertEqual(get_full_type_name(test_dict2), "dict")

        # Get it again with the compatible-but-different dataset type
        # definition, this time passing the DatasetType and dataId rather than
        # a DatasetRef. This should be consistent with the DatasetRef-based
        # get() behavior above and return the python type of the given
        # DatasetType.
        test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
        self.assertEqual(get_full_type_name(test_dict3), "dict")
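
    # The coercion exercised above relies on storage class compatibility: the
    # registry's definition of the dataset type stays authoritative, and a
    # compatible python type handed to put() is converted on write. A sketch
    # of the round trip it boils down to (names from the test above):
    #
    #     test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
    #     ref = butler.put(test_dict, "test_metric", dataId=dataId, visit=424)
    #     assert type(butler.get(ref)) is not dict  # comes back as MetricsExample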

    def testIngest(self) -> None:
        butler = self.create_empty_butler(run=self.default_run)

        # Create and register a DatasetType
        dimensions = butler.dimensions.conform(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData("day_obs", {"instrument": "DummyCamComp", "id": 20250101})
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "day_obs": 20250101,
            },
            {
                "instrument": "DummyCamComp",
                "id": 424,
                "name": "fourtwentyfour",
                "physical_filter": "d-r",
                "day_obs": 20250101,
            },
        )

        formatter = doImportType("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, run=self.default_run)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertFalse(self.are_uris_equivalent(uri1, uri2), f"Cf. {uri1} with {uri2}")

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, run=self.default_run))

        # Test "move" transfer to ensure that the files themselves
        # have disappeared following ingest.
        with ResourcePath.temporary_uri(suffix=".yaml") as tempFile:
            tempFile.transfer_from(ResourcePath(metricFile), transfer="copy")

            datasets = []
            datasets.append(FileDataset(path=tempFile, refs=refs, formatter=MultiDetectorFormatter))

            # For first ingest use copy.
            butler.ingest(*datasets, transfer="copy", record_validation_info=False)

            # Now try to ingest again in "execution butler" mode where
            # the registry entries exist but the datastore does not have
            # the files. We also need to strip the dimension records to ensure
            # that they will be re-added by the ingest.
            ref = datasets[0].refs[0]
            datasets[0].refs = [
                cast(
                    DatasetRef,
                    butler.find_dataset(ref.datasetType, data_id=ref.dataId, collections=ref.run),
                )
                for ref in datasets[0].refs
            ]
            all_refs = []
            for dataset in datasets:
                refs = []
                for ref in dataset.refs:
                    # Create a dict from the dataId to drop the records.
                    new_data_id = dict(ref.dataId.required)
                    new_ref = butler.find_dataset(ref.datasetType, new_data_id, collections=ref.run)
                    assert new_ref is not None
                    self.assertFalse(new_ref.dataId.hasRecords())
                    refs.append(new_ref)
                dataset.refs = refs
                all_refs.extend(dataset.refs)
            butler.pruneDatasets(all_refs, disassociate=False, unstore=True, purge=False)

            # Use move mode to test that the file is deleted. Also
            # disable recording of file size.
            butler.ingest(*datasets, transfer="move", record_validation_info=False)

            # Check that every ref now has records.
            for dataset in datasets:
                for ref in dataset.refs:
                    self.assertTrue(ref.dataId.hasRecords())

            # Ensure that the file has disappeared.
            self.assertFalse(tempFile.exists())

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler._datastore.getStoredItemsInfo(datasets[0].refs[0])  # type: ignore[attr-defined]
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertTrue(self.are_uris_equivalent(uri1, uri2), f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.exists(datasetTypeName, dataId1))
        self.assertTrue(butler.exists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

        # Ensure we can ingest 0 datasets
        datasets = []
        butler.ingest(*datasets)

    def testPickle(self) -> None:
        """Test pickle support."""
        butler = self.create_empty_butler(run=self.default_run)
        assert isinstance(butler, DirectButler), "Expect DirectButler in configuration"
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self) -> None:
        butler = self.create_empty_butler(run=self.default_run)
        dimensions = butler.dimensions.conform(["instrument", "visit", "physical_filter"])
        dimensionEntries: list[tuple[str, list[Mapping[str, Any]]]] = [
            (
                "instrument",
                [
                    {"instrument": "DummyCam"},
                    {"instrument": "DummyHSC"},
                    {"instrument": "DummyCamComp"},
                ],
            ),
            ("physical_filter", [{"instrument": "DummyCam", "name": "d-r", "band": "R"}]),
            ("day_obs", [{"instrument": "DummyCam", "id": 20250101}]),
            (
                "visit",
                [
                    {
                        "instrument": "DummyCam",
                        "id": 42,
                        "name": "fortytwo",
                        "physical_filter": "d-r",
                        "day_obs": 20250101,
                    }
                ],
            ),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for element, data in dimensionEntries:
            butler.registry.insertDimensionData(element, *data)

        # When a DatasetType is added to the registry, entries are not created
        # for its components, but querying the registry can still return the
        # components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry: set[DatasetType] = set()
        for parent_dataset_type in butler.registry.queryDatasetTypes():
            fromRegistry.add(parent_dataset_type)
            fromRegistry.update(parent_dataset_type.makeAllComponentDatasetTypes())
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self) -> None:
        butler = self.create_empty_butler(run=self.default_run)
        datasetTypeName = "test_metric"
        dimensions = butler.dimensions.conform(["instrument", "visit"])
        dimensionEntries: tuple[tuple[str, Mapping[str, Any]], ...] = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("day_obs", {"instrument": "DummyCam", "id": 20250101}),
            (
                "visit",
                {
                    "instrument": "DummyCam",
                    "id": 42,
                    "name": "fortytwo",
                    "physical_filter": "d-r",
                    "day_obs": 20250101,
                },
            ),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test get of a ref.
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get by dataset type name and dataId.
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.find_dataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.get(ref)

    def testMakeRepo(self) -> None:
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # Create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler.from_config(butlerConfig)
        assert isinstance(butler1, DirectButler), "Expect DirectButler in configuration"
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler.from_config(butlerConfig)
        assert isinstance(butler2, DirectButler), "Expect DirectButler in configuration"
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler.from_config(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self) -> None:
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler._datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self) -> None:
        """Test that dataIds can be rewritten based on dimension records."""
        butler = self.create_empty_butler(run=self.default_run)

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.dimensions.conform(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        butler.registry.insertDimensionData("day_obs", {"instrument": "DummyCamComp", "id": dayobs})

        for i in range(n_exposures):
            butler.registry.insertDimensionData("group", {"instrument": "DummyCamComp", "name": f"group{i}"})
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                    "group": f"group{i}",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)

        # Check that we can find the datasets using the day_obs or the
        # exposure.day_obs.
        datasets_1 = list(
            butler.registry.queryDatasets(
                datasetType,
                collections=self.default_run,
                where="day_obs = dayObs AND instrument = instr",
                bind={"dayObs": dayobs, "instr": "DummyCamComp"},
            )
        )
        datasets_2 = list(
            butler.registry.queryDatasets(
                datasetType,
                collections=self.default_run,
                where="exposure.day_obs = dayObs AND instrument = instr",
                bind={"dayObs": dayobs, "instr": "DummyCamComp"},
            )
        )
        self.assertEqual(datasets_1, datasets_2)
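
    # The rewrite tested above means a put() caller may identify an exposure
    # by unique metadata rather than by its primary key; conceptually (values
    # from the loop above):
    #
    #     {"instrument": "DummyCamComp", "day_obs": 20210530, "seq_num": i}
    #         --> {"instrument": "DummyCamComp", "exposure": i}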

    def testGetDatasetCollectionCaching(self) -> None:
        # Prior to DM-41117, there was a bug where get_dataset would throw
        # MissingCollectionError if you tried to fetch a dataset that was added
        # after the collection cache was last updated.
        reader_butler, datasetType = self.create_butler(self.default_run, "int", "datasettypename")
        writer_butler = self.create_empty_butler(writeable=True, run="new_run")
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        put_ref = writer_butler.put(123, datasetType, dataId)
        get_ref = reader_butler.get_dataset(put_ref.id)
        self.assertEqual(get_ref.id, put_ref.id)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    trustModeSupported = True

    def checkFileExists(self, root: str | ResourcePath, relpath: str | ResourcePath) -> bool:
        """Check if a file exists at the given path (relative to root).

        Test testPutTemplates verifies actual physical existence of the files
        in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()
1402 def testPutTemplates(self) -> None:
1403 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1404 butler = self.create_empty_butler(run=self.default_run)
1406 # Add needed Dimensions
1407 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1408 butler.registry.insertDimensionData(
1409 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
1410 )
1411 butler.registry.insertDimensionData("day_obs", {"instrument": "DummyCamComp", "id": 20250101})
1412 butler.registry.insertDimensionData(
1413 "visit",
1414 {
1415 "instrument": "DummyCamComp",
1416 "id": 423,
1417 "name": "v423",
1418 "physical_filter": "d-r",
1419 "day_obs": 20250101,
1420 },
1421 )
1422 butler.registry.insertDimensionData(
1423 "visit",
1424 {
1425 "instrument": "DummyCamComp",
1426 "id": 425,
1427 "name": "v425",
1428 "physical_filter": "d-r",
1429 "day_obs": 20250101,
1430 },
1431 )
1433 # Create and store a dataset
1434 metric = makeExampleMetrics()
1436 # Create two almost-identical DatasetTypes (both will use default
1437 # template)
1438 dimensions = butler.dimensions.conform(["instrument", "visit"])
1439 butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
1440 butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
1441 butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))
1443 dataId1 = {"instrument": "DummyCamComp", "visit": 423}
1444 dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}
1446 # Put with exactly the data ID keys needed
1447 ref = butler.put(metric, "metric1", dataId1)
1448 uri = butler.getURI(ref)
1449 self.assertTrue(uri.exists())
1450 self.assertTrue(
1451 uri.unquoted_path.endswith(f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle")
1452 )
1454 # Check the template based on dimensions
1455 if hasattr(butler._datastore, "templates"):
1456 butler._datastore.templates.validateTemplates([ref])
1458 # Put with extra data ID keys (physical_filter is an optional
1459 # dependency); this should not change the template (at least as
1460 # we currently define them to behave; the important thing is that
1461 # they are consistent).
1462 ref = butler.put(metric, "metric2", dataId2)
1463 uri = butler.getURI(ref)
1464 self.assertTrue(uri.exists())
1465 self.assertTrue(
1466 uri.unquoted_path.endswith(f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle")
1467 )
1469 # Check the template based on dimensions
1470 if hasattr(butler._datastore, "templates"):
1471 butler._datastore.templates.validateTemplates([ref])
1473 # Use a template that has a typo in dimension record metadata.
1474 # Easier to test with a butler that has a ref with records attached.
1475 template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits")
1476 with self.assertLogs("lsst.daf.butler.datastore.file_templates", "INFO"):
1477 path = template.format(ref)
1478 self.assertEqual(path, f"a/v423/{ref.id}_fits")
1480 template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits")
1481 with self.assertRaises(KeyError):
1482 with self.assertLogs("lsst.daf.butler.datastore.file_templates", "INFO"):
1483 template.format(ref)
1485 # Now use a file template that will not result in unique filenames
1486 with self.assertRaises(FileTemplateValidationError):
1487 butler.put(metric, "metric3", dataId1)
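# A hedged sketch of direct FileTemplate use, mirroring the checks
# above; this particular template string is illustrative, not a repo
# default.
def _example_template(self, ref: DatasetRef) -> str:
    template = FileTemplate("{run}/{datasetType}/{visit.name}_{id}.pickle")
    # format() substitutes fields from the ref; record fields such as
    # "visit.name" require dimension records attached to the data ID.
    return template.format(ref)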
1489 def testImportExport(self) -> None:
1490 # Run put/get tests just to create and populate a repo.
1491 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1492 self.runImportExportTest(storageClass)
1494 @unittest.expectedFailure
1495 def testImportExportVirtualComposite(self) -> None:
1496 # Run put/get tests just to create and populate a repo.
1497 storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
1498 self.runImportExportTest(storageClass)
1500 def runImportExportTest(self, storageClass: StorageClass) -> None:
1501 """Test exporting and importing.
1503 This test does an export to a temp directory and an import back
1504 into a new temp-directory repo. It does not assume a POSIX datastore.
1505 """
1506 exportButler = self.runPutGetTest(storageClass, "test_metric")
1508 # Test that we must have a file extension.
1509 with self.assertRaises(ValueError):
1510 with exportButler.export(filename="dump", directory=".") as export:
1511 pass
1513 # Test that unknown format is not allowed.
1514 with self.assertRaises(ValueError):
1515 with exportButler.export(filename="dump.fits", directory=".") as export:
1516 pass
1518 # Test that the repo actually has at least one dataset.
1519 datasets = list(exportButler.registry.queryDatasets(..., collections=...))
1520 self.assertGreater(len(datasets), 0)
1521 # Add a DimensionRecord that's unused by those datasets.
1522 skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
1523 exportButler.registry.insertDimensionData("skymap", skymapRecord)
1524 # Export and then import datasets.
1525 with safeTestTempDir(TESTDIR) as exportDir:
1526 exportFile = os.path.join(exportDir, "exports.yaml")
1527 with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
1528 export.saveDatasets(datasets)
1529 # Export the same datasets again. This should quietly do
1530 # nothing because of internal deduplication, and it shouldn't
1531 # complain about being asked to export the "htm7" elements even
1532 # though there aren't any in these datasets or in the database.
1533 export.saveDatasets(datasets, elements=["htm7"])
1534 # Save one of the data IDs again; this should be harmless
1535 # because of internal deduplication.
1536 export.saveDataIds([datasets[0].dataId])
1537 # Save some dimension records directly.
1538 export.saveDimensionData("skymap", [skymapRecord])
1539 self.assertTrue(os.path.exists(exportFile))
1540 with safeTestTempDir(TESTDIR) as importDir:
1541 # We always want this to be a local POSIX butler.
1542 Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
1543 # Calling script.butlerImport tests the implementation of the
1544 # butler command line interface "import" subcommand. Functions
1545 # in the script folder are generally considered protected and
1546 # should not be used as a public API.
1547 with open(exportFile) as f:
1548 script.butlerImport(
1549 importDir,
1550 export_file=f,
1551 directory=exportDir,
1552 transfer="auto",
1553 skip_dimensions=None,
1554 )
1555 importButler = Butler.from_config(importDir, run=self.default_run)
1556 for ref in datasets:
1557 with self.subTest(ref=ref):
1558 # Test for existence by passing in the DatasetType and
1559 # data ID separately, to avoid lookup by dataset_id.
1560 self.assertTrue(importButler.exists(ref.datasetType, ref.dataId))
1561 self.assertEqual(
1562 list(importButler.registry.queryDimensionRecords("skymap")),
1563 [importButler.dimensions["skymap"].RecordClass(**skymapRecord)],
1564 )
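# Condensed public-API sketch of the export/import round trip exercised
# above; the directory arguments are hypothetical.
def _example_export_import(
    self, source: Butler, target: Butler, refs: list[DatasetRef], tmpdir: str
) -> None:
    export_file = os.path.join(tmpdir, "exports.yaml")
    with source.export(filename=export_file, directory=tmpdir, transfer="copy") as export:
        export.saveDatasets(refs)
    # Butler.import_ is the public counterpart of script.butlerImport.
    target.import_(filename=export_file, directory=tmpdir, transfer="auto")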
1566 def testRemoveRuns(self) -> None:
1567 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1568 butler = self.create_empty_butler(writeable=True)
1569 # Load registry data with dimensions to hang datasets off of.
1570 registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
1571 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1572 # Add some RUN-type collection.
1573 run1 = "run1"
1574 butler.registry.registerRun(run1)
1575 run2 = "run2"
1576 butler.registry.registerRun(run2)
1577 # Put a dataset in each run.
1578 metric = makeExampleMetrics()
1579 dimensions = butler.dimensions.conform(["instrument", "physical_filter"])
1580 datasetType = self.addDatasetType(
1581 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1582 )
1583 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1584 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1585 uri1 = butler.getURI(ref1)
1586 uri2 = butler.getURI(ref2)
1588 with self.assertRaises(OrphanedRecordError):
1589 butler.registry.removeDatasetType(datasetType.name)
1591 # Remove from both runs with different values for unstore.
1592 butler.removeRuns([run1], unstore=True)
1593 butler.removeRuns([run2], unstore=False)
1594 # Should be nothing in registry for either one, and datastore should
1595 # not think either exists.
1596 with self.assertRaises(MissingCollectionError):
1597 butler.registry.getCollectionType(run1)
1598 with self.assertRaises(MissingCollectionError):
1599 butler.registry.getCollectionType(run2)
1600 self.assertFalse(butler.stored(ref1))
1601 self.assertFalse(butler.stored(ref2))
1602 # The ref we unstored should be gone according to the URI, but the
1603 # one we forgot should still be around.
1604 self.assertFalse(uri1.exists())
1605 self.assertTrue(uri2.exists())
1607 # Now that the collections have been pruned, we can remove the
1608 # dataset type.
1609 butler.registry.removeDatasetType(datasetType.name)
1611 with self.assertLogs("lsst.daf.butler.registry", "INFO") as cm:
1612 butler.registry.removeDatasetType(("test*", "test*"))
1613 self.assertIn("not defined", "\n".join(cm.output))
1615 def remove_dataset_out_of_band(self, butler: Butler, ref: DatasetRef) -> None:
1616 """Simulate an external actor removing a file outside of Butler's
1617 knowledge.
1619 Subclasses may override to handle more complicated datastore
1620 configurations.
1621 """
1622 uri = butler.getURI(ref)
1623 uri.remove()
1624 datastore = cast(FileDatastore, butler._datastore)
1625 datastore.cacheManager.remove_from_cache(ref)
1627 def testPruneDatasets(self) -> None:
1628 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1629 butler = self.create_empty_butler(writeable=True)
1630 # Load registry data with dimensions to hang datasets off of.
1631 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1632 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1633 # Add some RUN-type collections.
1634 run1 = "run1"
1635 butler.registry.registerRun(run1)
1636 run2 = "run2"
1637 butler.registry.registerRun(run2)
1638 # Put some datasets. ref1 and ref2 have the same data ID but are in
1639 # different runs; ref3 has a different data ID.
1640 metric = makeExampleMetrics()
1641 dimensions = butler.dimensions.conform(["instrument", "physical_filter"])
1642 datasetType = self.addDatasetType(
1643 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1644 )
1645 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1646 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1647 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1649 many_stored = butler.stored_many([ref1, ref2, ref3])
1650 for ref, stored in many_stored.items():
1651 self.assertTrue(stored, f"Ref {ref} should be stored")
1653 many_exists = butler._exists_many([ref1, ref2, ref3])
1654 for ref, exists in many_exists.items():
1655 self.assertTrue(exists, f"Checking ref {ref} exists.")
1656 self.assertEqual(exists, DatasetExistence.VERIFIED, f"Ref {ref} should be verified")
1658 # Simple prune.
1659 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1660 self.assertFalse(butler.exists(ref1.datasetType, ref1.dataId, collections=run1))
1662 many_stored = butler.stored_many([ref1, ref2, ref3])
1663 for ref, stored in many_stored.items():
1664 self.assertFalse(stored, f"Ref {ref} should not be stored")
1666 many_exists = butler._exists_many([ref1, ref2, ref3])
1667 for ref, exists in many_exists.items():
1668 self.assertEqual(exists, DatasetExistence.UNRECOGNIZED, f"Ref {ref} should not be stored")
1670 # Put data back.
1671 ref1_new = butler.put(metric, ref1)
1672 self.assertEqual(ref1_new, ref1) # Reuses original ID.
1673 ref2 = butler.put(metric, ref2)
1675 many_stored = butler.stored_many([ref1, ref2, ref3])
1676 self.assertTrue(many_stored[ref1])
1677 self.assertTrue(many_stored[ref2])
1678 self.assertFalse(many_stored[ref3])
1680 ref3 = butler.put(metric, ref3)
1682 many_exists = butler._exists_many([ref1, ref2, ref3])
1683 for ref, exists in many_exists.items():
1684 self.assertTrue(exists, f"Ref {ref} should be stored")
1686 # Clear out the datasets from registry and start again.
1687 refs = [ref1, ref2, ref3]
1688 butler.pruneDatasets(refs, purge=True, unstore=True)
1689 for ref in refs:
1690 butler.put(metric, ref)
1692 # Confirm we can retrieve deferred.
1693 dref1 = butler.getDeferred(ref1) # known and exists
1694 metric1 = dref1.get()
1695 self.assertEqual(metric1, metric)
1697 # Test different forms of file availability.
1698 # Need to be in a state where:
1699 # - one ref just has registry record.
1700 # - one ref has a missing file but a datastore record.
1701 # - one ref has a missing datastore record but file is there.
1702 # - one ref does not exist anywhere.
1703 # Do not need to test a ref that has everything since that is tested
1704 # above.
1705 ref0 = DatasetRef(
1706 datasetType,
1707 DataCoordinate.standardize(
1708 {"instrument": "Cam1", "physical_filter": "Cam1-G"}, universe=butler.dimensions
1709 ),
1710 run=run1,
1711 )
1713 # Delete from datastore and retain in Registry.
1714 butler.pruneDatasets([ref1], purge=False, unstore=True, disassociate=False)
1716 # File has been removed.
1717 self.remove_dataset_out_of_band(butler, ref2)
1719 # Datastore has lost track.
1720 butler._datastore.forget([ref3])
1722 # First test with a standard butler.
1723 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True)
1724 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED)
1725 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED)
1726 self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE)
1727 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED)
1729 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=False)
1730 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED)
1731 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED | DatasetExistence._ASSUMED)
1732 self.assertEqual(exists_many[ref2], DatasetExistence.KNOWN)
1733 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ASSUMED)
1734 self.assertTrue(exists_many[ref2])
1736 # Check that per-ref query gives the same answer as many query.
1737 for ref, exists in exists_many.items():
1738 self.assertEqual(butler.exists(ref, full_check=False), exists)
1740 # getDeferred checks for existence before it allows the dataset
1741 # to be retrieved.
1742 with self.assertRaises(LookupError):
1743 butler.getDeferred(ref3) # not known, file exists
1744 dref2 = butler.getDeferred(ref2) # known but file missing
1745 with self.assertRaises(FileNotFoundError):
1746 dref2.get()
1748 # Test again with a trusting butler.
1749 if self.trustModeSupported:
1750 butler._datastore.trustGetRequest = True
1751 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True)
1752 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED)
1753 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED)
1754 self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE)
1755 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ARTIFACT)
1757 # When trusting, we can get a deferred dataset handle for a ref
1758 # that is not known to the datastore but whose artifact does exist.
1759 dref3 = butler.getDeferred(ref3)
1760 metric3 = dref3.get()
1761 self.assertEqual(metric3, metric)
1763 # Check that per-ref query gives the same answer as many query.
1764 for ref, exists in exists_many.items():
1765 self.assertEqual(butler.exists(ref, full_check=True), exists)
1767 # Create a ref that surprisingly has the UUID of an existing ref
1768 # but is not the same.
1769 ref_bad = DatasetRef(datasetType, dataId=ref3.dataId, run=ref3.run, id=ref2.id)
1770 with self.assertRaises(ValueError):
1771 butler.exists(ref_bad)
1773 # Create a ref that has a compatible storage class.
1774 ref_compat = ref2.overrideStorageClass("StructuredDataDict")
1775 exists = butler.exists(ref_compat)
1776 self.assertEqual(exists, exists_many[ref2])
1778 # Remove everything and start from scratch.
1779 butler._datastore.trustGetRequest = False
1780 butler.pruneDatasets(refs, purge=True, unstore=True)
1781 for ref in refs:
1782 butler.put(metric, ref)
1784 # These tests mess directly with the trash table and can leave the
1785 # datastore in an odd state. Do them at the end.
1786 # Check that in normal mode, deleting the record means that
1787 # emptying the trash will not touch the file.
1788 uri1 = butler.getURI(ref1)
1789 butler._datastore.bridge.moveToTrash(
1790 [ref1], transaction=None
1791 ) # Update the dataset_location table
1792 butler._datastore.forget([ref1])
1793 butler._datastore.trash(ref1)
1794 butler._datastore.emptyTrash()
1795 self.assertTrue(uri1.exists())
1796 uri1.remove() # Clean it up.
1798 # Simulate execution butler setup by deleting the datastore
1799 # record but keeping the file around and trusting.
1800 butler._datastore.trustGetRequest = True
1801 uris = butler.get_many_uris([ref2, ref3])
1802 uri2 = uris[ref2].primaryURI
1803 uri3 = uris[ref3].primaryURI
1804 self.assertTrue(uri2.exists())
1805 self.assertTrue(uri3.exists())
1807 # Remove the datastore record.
1808 butler._datastore.bridge.moveToTrash(
1809 [ref2], transaction=None
1810 ) # Update the dataset_location table
1811 butler._datastore.forget([ref2])
1812 self.assertTrue(uri2.exists())
1813 butler._datastore.trash([ref2, ref3])
1814 # The file for ref2 is removed immediately.
1815 self.assertFalse(uri2.exists())
1816 # But ref3 has to wait for emptyTrash().
1817 self.assertTrue(uri3.exists())
1818 butler._datastore.emptyTrash()
1819 self.assertFalse(uri3.exists())
1821 # Clear out the datasets from registry.
1822 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
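# Sketch of interpreting the composite DatasetExistence flags asserted
# above; the helper name is hypothetical.
def _example_existence(self, butler: Butler, ref: DatasetRef) -> str:
    existence = butler.exists(ref, full_check=True)
    if existence == DatasetExistence.VERIFIED:
        return "registry record, datastore record, and artifact all present"
    if existence & DatasetExistence.RECORDED:
        # Partial states such as plain RECORDED can be picked apart
        # with bitwise tests, as the assertions above do.
        return "registry knows the dataset but it is not fully present"
    return "unrecognized"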
1825class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1826 """PosixDatastore specialization of a butler"""
1828 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1829 fullConfigKey: str | None = ".datastore.formatters"
1830 validationCanFail = True
1831 datastoreStr = ["/tmp"]
1832 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1833 registryStr = "/gen3.sqlite3"
1835 def testPathConstructor(self) -> None:
1836 """Independent test of constructor using PathLike."""
1837 butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
1838 self.assertIsInstance(butler, Butler)
1840 # And again with a Path object with the butler yaml
1841 path = pathlib.Path(self.tmpConfigFile)
1842 butler = Butler.from_config(path, writeable=False)
1843 self.assertIsInstance(butler, Butler)
1845 # And again with a Path object without the butler yaml
1846 # (making sure we skip it if the tmp config doesn't end
1847 # in butler.yaml -- which is the case for a subclass)
1848 if self.tmpConfigFile.endswith("butler.yaml"):
1849 path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
1850 butler = Butler.from_config(path, writeable=False)
1851 self.assertIsInstance(butler, Butler)
1853 def testExportTransferCopy(self) -> None:
1854 """Test local export using all transfer modes"""
1855 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1856 exportButler = self.runPutGetTest(storageClass, "test_metric")
1857 # Test that the repo actually has at least one dataset.
1858 datasets = list(exportButler.registry.queryDatasets(..., collections=...))
1859 self.assertGreater(len(datasets), 0)
1860 uris = [exportButler.getURI(d) for d in datasets]
1861 assert isinstance(exportButler._datastore, FileDatastore)
1862 datastoreRoot = exportButler.get_datastore_roots()[exportButler.get_datastore_names()[0]]
1864 pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]
1866 for path in pathsInStore:
1867 # Assume local file system
1868 assert path is not None
1869 self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")
1871 for transfer in ("copy", "link", "symlink", "relsymlink"):
1872 with safeTestTempDir(TESTDIR) as exportDir:
1873 with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
1874 export.saveDatasets(datasets)
1875 for path in pathsInStore:
1876 assert path is not None
1877 self.assertTrue(
1878 self.checkFileExists(exportDir, path),
1879 f"Check that mode {transfer} exported files",
1880 )
1882 def testPytypeCoercion(self) -> None:
1883 """Test python type coercion on Butler.get and put."""
1884 # Store some data with the normal example storage class.
1885 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1886 datasetTypeName = "test_metric"
1887 butler = self.runPutGetTest(storageClass, datasetTypeName)
1889 dataId = {"instrument": "DummyCamComp", "visit": 423}
1890 metric = butler.get(datasetTypeName, dataId=dataId)
1891 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1893 datasetType_ori = butler.get_dataset_type(datasetTypeName)
1894 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1896 # Now need to hack the registry dataset type definition.
1897 # There is no API for this.
1898 assert isinstance(butler._registry, SqlRegistry)
1899 manager = butler._registry._managers.datasets
1900 assert hasattr(manager, "_db") and hasattr(manager, "_static")
1901 manager._db.update(
1902 manager._static.dataset_type,
1903 {"name": datasetTypeName},
1904 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1905 )
1907 # Force reset of dataset type cache
1908 butler.registry.refresh()
1910 datasetType_new = butler.get_dataset_type(datasetTypeName)
1911 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1912 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1914 metric_model = butler.get(datasetTypeName, dataId=dataId)
1915 self.assertNotEqual(type(metric_model), type(metric))
1916 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1918 # Put the model and read it back to show that everything now
1919 # works as normal.
1920 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1921 metric_model_new = butler.get(metric_ref)
1922 self.assertEqual(metric_model_new, metric_model)
1924 # Hack the storage class again to something that will fail on the
1925 # get with no conversion class.
1926 manager._db.update(
1927 manager._static.dataset_type,
1928 {"name": datasetTypeName},
1929 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1930 )
1931 butler.registry.refresh()
1933 with self.assertRaises(ValueError):
1934 butler.get(datasetTypeName, dataId=dataId)
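# Related public-API sketch: a compatible storage class can also be
# requested at get time, without hacking registry internals as done
# above; assumes the two storage classes are convertible.
def _example_get_converted(self, butler: Butler, ref: DatasetRef) -> Any:
    return butler.get(ref, storageClass="StructuredDataDict")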
1937@unittest.skipUnless(testing is not None, "testing.postgresql module not found")
1938class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1939 """PosixDatastore specialization of a butler using Postgres"""
1941 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1942 fullConfigKey = ".datastore.formatters"
1943 validationCanFail = True
1944 datastoreStr = ["/tmp"]
1945 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1946 registryStr = "PostgreSQL@test"
1947 postgresql: Any
1949 @staticmethod
1950 def _handler(postgresql: Any) -> None:
1951 engine = sqlalchemy.engine.create_engine(postgresql.url())
1952 with engine.begin() as connection:
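# The registry's exclusion constraints on timespan columns need the
# btree_gist extension, so install it into the test database.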
1953 connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;"))
1955 @classmethod
1956 def setUpClass(cls) -> None:
1957 # Create the postgres test server.
1958 cls.postgresql = testing.postgresql.PostgresqlFactory(
1959 cache_initialized_db=True, on_initialized=cls._handler
1960 )
1961 super().setUpClass()
1963 @classmethod
1964 def tearDownClass(cls) -> None:
1965 # Clean up any lingering SQLAlchemy engines/connections
1966 # so they're closed before we shut down the server.
1967 gc.collect()
1968 cls.postgresql.clear_cache()
1969 super().tearDownClass()
1971 def setUp(self) -> None:
1972 self.server = self.postgresql()
1974 # Need to add a registry section to the config.
1975 self._temp_config = False
1976 config = Config(self.configFile)
1977 config["registry", "db"] = self.server.url()
1978 with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh:
1979 config.dump(fh)
1980 self.configFile = fh.name
1981 self._temp_config = True
1982 super().setUp()
1984 def tearDown(self) -> None:
1985 self.server.stop()
1986 if self._temp_config and os.path.exists(self.configFile):
1987 os.remove(self.configFile)
1988 super().tearDown()
1990 def testMakeRepo(self) -> None:
1991 # The base class test assumes that it is using SQLite and that
1992 # the config file is acceptable to SQLite.
1993 raise unittest.SkipTest("Postgres config is not compatible with this test.")
1996@unittest.skipUnless(testing is not None, "testing.postgresql module not found")
1997class ClonedPostgresPosixDatastoreButlerTestCase(PostgresPosixDatastoreButlerTestCase, unittest.TestCase):
1998 """Test that Butler with a Postgres registry still works after cloning."""
2000 def create_butler(
2001 self, run: str, storageClass: StorageClass | str, datasetTypeName: str
2002 ) -> tuple[DirectButler, DatasetType]:
2003 butler, datasetType = super().create_butler(run, storageClass, datasetTypeName)
2004 return butler._clone(run=run), datasetType
2007class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
2008 """InMemoryDatastore specialization of a butler"""
2010 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
2011 fullConfigKey = None
2012 useTempRoot = False
2013 validationCanFail = False
2014 datastoreStr = ["datastore='InMemory"]
2015 datastoreName = ["InMemoryDatastore@"]
2016 registryStr = "/gen3.sqlite3"
2018 def testIngest(self) -> None:
2019 pass
2022class ClonedSqliteButlerTestCase(InMemoryDatastoreButlerTestCase, unittest.TestCase):
2023 """Test that a Butler with a Sqlite registry still works after cloning."""
2025 def create_butler(
2026 self, run: str, storageClass: StorageClass | str, datasetTypeName: str
2027 ) -> tuple[DirectButler, DatasetType]:
2028 butler, datasetType = super().create_butler(run, storageClass, datasetTypeName)
2029 return butler._clone(run=run), datasetType
2032class ChainedDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
2033 """PosixDatastore specialization"""
2035 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
2036 fullConfigKey = ".datastore.datastores.1.formatters"
2037 validationCanFail = True
2038 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
2039 datastoreName = [
2040 "InMemoryDatastore@",
2041 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
2042 "SecondDatastore",
2043 ]
2044 registryStr = "/gen3.sqlite3"
2046 def testPruneDatasets(self) -> None:
2047 # This test relies on manipulating files out-of-band, which is
2048 # impossible for this configuration because of the InMemoryDatastore in
2049 # the ChainedDatastore.
2050 pass
2053class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
2054 """Test that a yaml file in one location can refer to a root in another."""
2056 datastoreStr = ["dir1"]
2057 # Disable the makeRepo test since we are deliberately not using
2058 # butler.yaml as the config name.
2059 fullConfigKey = None
2061 def setUp(self) -> None:
2062 self.root = makeTestTempDir(TESTDIR)
2064 # Make a new repository in one place
2065 self.dir1 = os.path.join(self.root, "dir1")
2066 Butler.makeRepo(self.dir1, config=Config(self.configFile))
2068 # Move the yaml file to a different place and add a "root"
2069 self.dir2 = os.path.join(self.root, "dir2")
2070 os.makedirs(self.dir2, exist_ok=True)
2071 configFile1 = os.path.join(self.dir1, "butler.yaml")
2072 config = Config(configFile1)
2073 config["root"] = self.dir1
2074 configFile2 = os.path.join(self.dir2, "butler2.yaml")
2075 config.dumpToUri(configFile2)
2076 os.remove(configFile1)
2077 self.tmpConfigFile = configFile2
2079 def testFileLocations(self) -> None:
2080 self.assertNotEqual(self.dir1, self.dir2)
2081 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
2082 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
2083 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
2086class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
2087 """Test that a config file created by makeRepo outside of repo works."""
2089 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
2091 def setUp(self) -> None:
2092 self.root = makeTestTempDir(TESTDIR)
2093 self.root2 = makeTestTempDir(TESTDIR)
2095 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
2096 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
2098 def tearDown(self) -> None:
2099 if os.path.exists(self.root2):
2100 shutil.rmtree(self.root2, ignore_errors=True)
2101 super().tearDown()
2103 def testConfigExistence(self) -> None:
2104 c = Config(self.tmpConfigFile)
2105 uri_config = ResourcePath(c["root"])
2106 uri_expected = ResourcePath(self.root, forceDirectory=True)
2107 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
2108 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
2110 def testPutGet(self) -> None:
2111 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
2112 self.runPutGetTest(storageClass, "test_metric")
2115class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
2116 """Test that a config file created by makeRepo outside of repo works."""
2118 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
2120 def setUp(self) -> None:
2121 self.root = makeTestTempDir(TESTDIR)
2122 self.root2 = makeTestTempDir(TESTDIR)
2124 self.tmpConfigFile = self.root2
2125 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
2127 def testConfigExistence(self) -> None:
2128 # Append the yaml file name, otherwise the Config constructor does
2129 # not know the file type.
2130 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
2131 super().testConfigExistence()
2134class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
2135 """Test that a config file created by makeRepo outside of repo works."""
2137 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
2139 def setUp(self) -> None:
2140 self.root = makeTestTempDir(TESTDIR)
2141 self.root2 = makeTestTempDir(TESTDIR)
2143 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
2144 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
2147@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
2148class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
2149 """S3Datastore specialization of a butler; an S3 storage Datastore +
2150 a local in-memory SqlRegistry.
2151 """
2153 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
2154 fullConfigKey = None
2155 validationCanFail = True
2157 bucketName = "anybucketname"
2158 """Name of the Bucket that will be used in the tests. The name is read from
2159 the config file used with the tests during set-up.
2160 """
2162 root = "butlerRoot/"
2163 """Root repository directory expected to be used in case useTempRoot=False.
2164 Otherwise the root is set to a 20 characters long randomly generated string
2165 during set-up.
2166 """
2168 datastoreStr = [f"datastore={root}"]
2169 """Contains all expected root locations in a format expected to be
2170 returned by Butler stringification.
2171 """
2173 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
2174 """The expected format of the S3 Datastore string."""
2176 registryStr = "/gen3.sqlite3"
2177 """Expected format of the Registry string."""
2179 mock_aws = mock_aws()
2180 """The mocked s3 interface from moto."""
2182 def genRoot(self) -> str:
2183 """Return a random string of len 20 to serve as a root
2184 name for the temporary bucket repo.
2186 This is equivalent to tempfile.mkdtemp as this is what self.root
2187 becomes when useTempRoot is True.
2188 """
2189 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
2190 return rndstr + "/"
2192 def setUp(self) -> None:
2193 config = Config(self.configFile)
2194 uri = ResourcePath(config[".datastore.datastore.root"])
2195 self.bucketName = uri.netloc
2197 # Enable S3 mocking of tests.
2198 self.enterContext(clean_test_environment_for_s3())
2199 self.mock_aws.start()
2201 if self.useTempRoot:
2202 self.root = self.genRoot()
2203 rooturi = f"s3://{self.bucketName}/{self.root}"
2204 config.update({"datastore": {"datastore": {"root": rooturi}}})
2206 # Need a local folder to store the registry database.
2207 self.reg_dir = makeTestTempDir(TESTDIR)
2208 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
2210 # Moto needs to know that we expect the bucket self.bucketName to
2211 # exist (previously this was the fixed class attribute bucketName).
2212 s3 = boto3.resource("s3")
2213 s3.create_bucket(Bucket=self.bucketName)
2215 self.datastoreStr = [f"datastore='{rooturi}'"]
2216 self.datastoreName = [f"FileDatastore@{rooturi}"]
2217 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
2218 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
2220 def tearDown(self) -> None:
2221 s3 = boto3.resource("s3")
2222 bucket = s3.Bucket(self.bucketName)
2223 try:
2224 bucket.objects.all().delete()
2225 except botocore.exceptions.ClientError as e:
2226 if e.response["Error"]["Code"] == "404":
2227 # The key was not reachable; nothing to delete.
2228 pass
2229 else:
2230 raise
2232 bucket = s3.Bucket(self.bucketName)
2233 bucket.delete()
2235 # Stop the S3 mock.
2236 self.mock_aws.stop()
2238 if self.reg_dir is not None and os.path.exists(self.reg_dir):
2239 shutil.rmtree(self.reg_dir, ignore_errors=True)
2241 if self.useTempRoot and os.path.exists(self.root):
2242 shutil.rmtree(self.root, ignore_errors=True)
2244 super().tearDown()
2247class PosixDatastoreTransfers(unittest.TestCase):
2248 """Test data transfers between butlers.
2250 Different dataset ID managers are exercised: UUID-to-UUID and
2251 integer-to-integer transfers are tested. UUID-to-integer is not
2252 supported since we do not currently want to allow it. Integer-to-UUID
2253 is supported, with the caveat that a UUID4 will be generated, which
2254 is incorrect for raw dataset types; the tests ignore that.
2255 """
2257 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
2258 storageClassFactory: StorageClassFactory
2260 @classmethod
2261 def setUpClass(cls) -> None:
2262 cls.storageClassFactory = StorageClassFactory()
2263 cls.storageClassFactory.addFromConfig(cls.configFile)
2265 def setUp(self) -> None:
2266 self.root = makeTestTempDir(TESTDIR)
2267 self.config = Config(self.configFile)
2269 def tearDown(self) -> None:
2270 removeTestTempDir(self.root)
2272 def create_butler(self, manager: str, label: str) -> Butler:
2273 config = Config(self.configFile)
2274 config["registry", "managers", "datasets"] = manager
2275 return Butler.from_config(
2276 Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True
2277 )
2279 def create_butlers(self, manager1: str | None = None, manager2: str | None = None) -> None:
2280 default = "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID"
2281 if manager1 is None:
2282 manager1 = default
2283 if manager2 is None:
2284 manager2 = default
2285 self.source_butler = self.create_butler(manager1, "1")
2286 self.target_butler = self.create_butler(manager2, "2")
2288 def testTransferUuidToUuid(self) -> None:
2289 self.create_butlers()
2290 self.assertButlerTransfers()
2292 def testTransferMissing(self) -> None:
2293 """Test transfers where datastore records are missing.
2295 This is how execution butler works.
2296 """
2297 self.create_butlers()
2299 # Configure the source butler to allow trust.
2300 self.source_butler._datastore._set_trust_mode(True)
2302 self.assertButlerTransfers(purge=True)
2304 def testTransferMissingDisassembly(self) -> None:
2305 """Test transfers where datastore records are missing.
2307 This is how execution butler works.
2308 """
2309 self.create_butlers()
2311 # Configure the source butler to allow trust.
2312 self.source_butler._datastore._set_trust_mode(True)
2314 # Test disassembly.
2315 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
2317 def testAbsoluteURITransferDirect(self) -> None:
2318 """Test transfer using an absolute URI."""
2319 self._absolute_transfer("auto")
2321 def testAbsoluteURITransferCopy(self) -> None:
2322 """Test transfer using an absolute URI."""
2323 self._absolute_transfer("copy")
2325 def _absolute_transfer(self, transfer: str) -> None:
2326 self.create_butlers()
2328 storageClassName = "StructuredData"
2329 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
2330 datasetTypeName = "random_data"
2331 run = "run1"
2332 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
2334 dimensions = self.source_butler.dimensions.conform(())
2335 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
2336 self.source_butler.registry.registerDatasetType(datasetType)
2338 metrics = makeExampleMetrics()
2339 with ResourcePath.temporary_uri(suffix=".json") as temp:
2340 dataId = DataCoordinate.make_empty(self.source_butler.dimensions)
2341 source_refs = [DatasetRef(datasetType, dataId, run=run)]
2342 temp.write(json.dumps(metrics.exportAsDict()).encode())
2343 dataset = FileDataset(path=temp, refs=source_refs)
2344 self.source_butler.ingest(dataset, transfer="direct")
2346 self.target_butler.transfer_from(
2347 self.source_butler, dataset.refs, register_dataset_types=True, transfer=transfer
2348 )
2350 uri = self.target_butler.getURI(dataset.refs[0])
2351 if transfer == "auto":
2352 self.assertEqual(uri, temp)
2353 else:
2354 self.assertNotEqual(uri, temp)
2356 def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "StructuredData") -> None:
2357 """Test that a run can be transferred to another butler."""
2358 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
2359 datasetTypeName = "random_data"
2361 # Test will create 3 collections and we will want to transfer
2362 # two of those three.
2363 runs = ["run1", "run2", "other"]
2365 # Also want to use two different dataset types to ensure that
2366 # grouping works.
2367 datasetTypeNames = ["random_data", "random_data_2"]
2369 # Create the run collections in the source butler.
2370 for run in runs:
2371 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
2373 # Create dimensions in source butler.
2374 n_exposures = 30
2375 self.source_butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
2376 self.source_butler.registry.insertDimensionData(
2377 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
2378 )
2379 self.source_butler.registry.insertDimensionData(
2380 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
2381 )
2382 self.source_butler.registry.insertDimensionData(
2383 "day_obs",
2384 {
2385 "instrument": "DummyCamComp",
2386 "id": 20250101,
2387 },
2388 )
2390 for i in range(n_exposures):
2391 self.source_butler.registry.insertDimensionData(
2392 "group", {"instrument": "DummyCamComp", "name": f"group{i}"}
2393 )
2394 self.source_butler.registry.insertDimensionData(
2395 "exposure",
2396 {
2397 "instrument": "DummyCamComp",
2398 "id": i,
2399 "obs_id": f"exp{i}",
2400 "physical_filter": "d-r",
2401 "group": f"group{i}",
2402 "day_obs": 20250101,
2403 },
2404 )
2406 # Create dataset types in the source butler.
2407 dimensions = self.source_butler.dimensions.conform(["instrument", "exposure"])
2408 for datasetTypeName in datasetTypeNames:
2409 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
2410 self.source_butler.registry.registerDatasetType(datasetType)
2412 # Write a dataset to an unrelated run -- this will ensure that
2413 # we are rewriting integer dataset IDs in the target if necessary.
2414 # This is not relevant for UUIDs.
2415 run = "distraction"
2416 butler = Butler.from_config(butler=self.source_butler, run=run)
2417 butler.put(
2418 makeExampleMetrics(),
2419 datasetTypeName,
2420 exposure=1,
2421 instrument="DummyCamComp",
2422 physical_filter="d-r",
2423 )
2425 # Write some example metrics to the source
2426 butler = Butler.from_config(butler=self.source_butler)
2428 # Set of DatasetRefs that should be in the list of refs to transfer
2429 # but which will not be transferred.
2430 deleted: set[DatasetRef] = set()
2432 n_expected = 20 # Number of datasets expected to be transferred
2433 source_refs = []
2434 for i in range(n_exposures):
2435 # Put a third of datasets into each collection, only retain
2436 # two thirds.
2437 index = i % 3
2438 run = runs[index]
2439 datasetTypeName = datasetTypeNames[i % 2]
2441 metric = MetricsExample(
2442 summary={"counter": i}, output={"text": "metric"}, data=[2 * x for x in range(i)]
2443 )
2444 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
2445 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
2447 # Remove the datastore record using low-level API, but only
2448 # for a specific index.
2449 if purge and index == 1:
2450 # For one of these delete the file as well.
2451 # This allows the "missing" code to filter the
2452 # file out.
2453 # Access the individual datastores.
2454 datastores = []
2455 if hasattr(butler._datastore, "datastores"):
2456 datastores.extend(butler._datastore.datastores)
2457 else:
2458 datastores.append(butler._datastore)
2460 if not deleted:
2461 # For a chained datastore we need to remove
2462 # files in each chain.
2463 for datastore in datastores:
2464 # The file might not be known to the datastore
2465 # if constraints are used.
2466 try:
2467 primary, uris = datastore.getURIs(ref)
2468 except FileNotFoundError:
2469 continue
2470 if primary and primary.scheme != "mem":
2471 primary.remove()
2472 for uri in uris.values():
2473 if uri.scheme != "mem":
2474 uri.remove()
2475 n_expected -= 1
2476 deleted.add(ref)
2478 # Remove the datastore record.
2479 for datastore in datastores:
2480 if hasattr(datastore, "removeStoredItemInfo"):
2481 datastore.removeStoredItemInfo(ref)
2483 if index < 2:
2484 source_refs.append(ref)
2485 if ref not in deleted:
2486 new_metric = butler.get(ref)
2487 self.assertEqual(new_metric, metric)
2489 # Create some bad dataset types to ensure we check for inconsistent
2490 # definitions.
2491 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
2492 for datasetTypeName in datasetTypeNames:
2493 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
2494 self.target_butler.registry.registerDatasetType(datasetType)
2495 with self.assertRaises(ConflictingDefinitionError) as cm:
2496 self.target_butler.transfer_from(self.source_butler, source_refs)
2497 self.assertIn("dataset type differs", str(cm.exception))
2499 # And remove the bad definitions.
2500 for datasetTypeName in datasetTypeNames:
2501 self.target_butler.registry.removeDatasetType(datasetTypeName)
2503 # Transfer without creating dataset types should fail.
2504 with self.assertRaises(KeyError):
2505 self.target_butler.transfer_from(self.source_butler, source_refs)
2507 # Transfer without creating dimensions should fail.
2508 with self.assertRaises(ConflictingDefinitionError) as cm:
2509 self.target_butler.transfer_from(self.source_butler, source_refs, register_dataset_types=True)
2510 self.assertIn("dimension", str(cm.exception))
2512 # The failed transfer above leaves the registry in an inconsistent
2513 # state because the run is created but then rolled back without
2514 # the collection cache being cleared. For now, force a refresh;
2515 # this can be removed with DM-35498.
2516 self.target_butler.registry.refresh()
2518 # Do a dry run -- this should not have any effect on the target butler.
2519 self.target_butler.transfer_from(self.source_butler, source_refs, dry_run=True)
2521 # Transfer the records for one ref to test the alternative API.
2522 with self.assertLogs(logger="lsst", level=logging.DEBUG) as log_cm:
2523 self.target_butler.transfer_dimension_records_from(self.source_butler, [source_refs[0]])
2524 self.assertIn("number of records transferred: 1", ";".join(log_cm.output))
2526 # Now transfer them to the second butler, including dimensions.
2527 with self.assertLogs(logger="lsst", level=logging.DEBUG) as log_cm:
2528 transferred = self.target_butler.transfer_from(
2529 self.source_butler,
2530 source_refs,
2531 register_dataset_types=True,
2532 transfer_dimensions=True,
2533 )
2534 self.assertEqual(len(transferred), n_expected)
2535 log_output = ";".join(log_cm.output)
2537 # A ChainedDatastore will use the in-memory datastore for mexists,
2538 # so we cannot rely on the mexists log message.
2539 self.assertIn("Number of datastore records found in source", log_output)
2540 self.assertIn("Creating output run", log_output)
2542 # Do the transfer twice to ensure that it will do nothing extra.
2543 # Only do this if purge=True because it does not work for integer
2544 # dataset_id values.
2545 if purge:
2546 # This should not need to register dataset types.
2547 transferred = self.target_butler.transfer_from(self.source_butler, source_refs)
2548 self.assertEqual(len(transferred), n_expected)
2550 # Also do an explicit low-level transfer to trigger some
2551 # edge cases.
2552 with self.assertLogs(level=logging.DEBUG) as log_cm:
2553 self.target_butler._datastore.transfer_from(self.source_butler._datastore, source_refs)
2554 log_output = ";".join(log_cm.output)
2555 self.assertIn("no file artifacts exist", log_output)
2557 with self.assertRaises((TypeError, AttributeError)):
2558 self.target_butler._datastore.transfer_from(self.source_butler, source_refs) # type: ignore
2560 with self.assertRaises(ValueError):
2561 self.target_butler._datastore.transfer_from(
2562 self.source_butler._datastore, source_refs, transfer="split"
2563 )
2565 # Now try to get the same refs from the new butler.
2566 for ref in source_refs:
2567 if ref not in deleted:
2568 new_metric = self.target_butler.get(ref)
2569 old_metric = self.source_butler.get(ref)
2570 self.assertEqual(new_metric, old_metric)
2572 # Now prune run2 collection and create instead a CHAINED collection.
2573 # This should block the transfer.
2574 self.target_butler.removeRuns(["run2"], unstore=True)
2575 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2576 with self.assertRaises(CollectionTypeError):
2577 # Re-importing the run1 datasets can be problematic if they
2578 # use integer IDs, so filter those out.
2579 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2580 self.target_butler.transfer_from(self.source_butler, to_transfer)
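# Minimal public-API sketch of the butler-to-butler transfer exercised
# at length above; butler construction is elided and hypothetical.
def _example_transfer(self, source: Butler, target: Butler, refs: list[DatasetRef]) -> int:
    transferred = target.transfer_from(
        source,
        refs,
        transfer="auto",
        register_dataset_types=True,  # Create any missing dataset types.
        transfer_dimensions=True,  # Copy required dimension records too.
    )
    return len(transferred)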
2583class ChainedDatastoreTransfers(PosixDatastoreTransfers):
2584 """Test transfers using a chained datastore."""
2586 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
2589class NullDatastoreTestCase(unittest.TestCase):
2590 """Test that we can fall back to a null datastore."""
2592 # Need a good config to create the repo.
2593 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
2594 storageClassFactory: StorageClassFactory
2596 @classmethod
2597 def setUpClass(cls) -> None:
2598 cls.storageClassFactory = StorageClassFactory()
2599 cls.storageClassFactory.addFromConfig(cls.configFile)
2601 def setUp(self) -> None:
2602 """Create a new butler root for each test."""
2603 self.root = makeTestTempDir(TESTDIR)
2604 Butler.makeRepo(self.root, config=Config(self.configFile))
2606 def tearDown(self) -> None:
2607 removeTestTempDir(self.root)
2609 def test_fallback(self) -> None:
2610 # Read the butler config and mess with the datastore section.
2611 config_path = os.path.join(self.root, "butler.yaml")
2612 bad_config = Config(config_path)
2613 bad_config["datastore", "cls"] = "lsst.not.a.datastore.Datastore"
2614 bad_config.dumpToUri(config_path)
2616 with self.assertRaises(RuntimeError):
2617 Butler(self.root, without_datastore=False)
2619 with self.assertRaises(RuntimeError):
2620 Butler.from_config(self.root, without_datastore=False)
2622 butler = Butler.from_config(self.root, writeable=True, without_datastore=True)
2623 self.assertIsInstance(butler._datastore, NullDatastore)
2625 # Check that registry is working.
2626 butler.registry.registerRun("MYRUN")
2627 collections = butler.registry.queryCollections(...)
2628 self.assertIn("MYRUN", set(collections))
2630 # Create a ref.
2631 dimensions = butler.dimensions.conform([])
2632 storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
2633 datasetTypeName = "metric"
2634 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
2635 butler.registry.registerDatasetType(datasetType)
2636 ref = DatasetRef(datasetType, {}, run="MYRUN")
2638 # Check that datastore will complain.
2639 with self.assertRaises(FileNotFoundError):
2640 butler.get(ref)
2641 with self.assertRaises(FileNotFoundError):
2642 butler.getURI(ref)
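# Sketch of the registry-only access pattern that without_datastore
# enables, e.g. for metadata-only queries; the repo path is assumed to
# exist.
def _example_registry_only(self, root: str) -> list[str]:
    butler = Butler.from_config(root, without_datastore=True)
    return [dt.name for dt in butler.registry.queryDatasetTypes(...)]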
2645@unittest.skipIf(create_test_server is None, "Server dependencies not installed.")
2646class ButlerServerTests(FileDatastoreButlerTests, unittest.TestCase):
2647 """Test RemoteButler and Butler server."""
2649 configFile = None
2650 predictionSupported = False
2651 trustModeSupported = False
2653 def setUp(self):
2654 self.server_instance = self.enterContext(create_test_server(TESTDIR))
2656 def tearDown(self):
2657 pass
2659 def are_uris_equivalent(self, uri1: ResourcePath, uri2: ResourcePath) -> bool:
2660 # S3 pre-signed URLs may end up with differing expiration times in the
2661 # query parameters, so ignore query parameters when comparing.
2662 return uri1.scheme == uri2.scheme and uri1.netloc == uri2.netloc and uri1.path == uri2.path
2664 def create_empty_butler(self, run: str | None = None, writeable: bool | None = None) -> Butler:
2665 return self.server_instance.hybrid_butler._clone(run=run)
2667 def remove_dataset_out_of_band(self, butler: Butler, ref: DatasetRef) -> None:
2668 # Can't delete a file via S3 signed URLs, so we need to reach in
2669 # through DirectButler to delete the dataset.
2670 uri = self.server_instance.direct_butler.getURI(ref)
2671 uri.remove()
2673 def testConstructor(self):
2674 # RemoteButler constructor is tested in test_server.py and
2675 # test_remote_butler.py.
2676 pass
2678 def testDafButlerRepositories(self):
2679 # Loading of RemoteButler via repository index is tested in
2680 # test_server.py.
2681 pass
2683 def testGetDatasetTypes(self) -> None:
2684 # This is mostly a test of validateConfiguration, which is for
2685 # validating Datastore configuration and thus isn't relevant to
2686 # RemoteButler.
2687 pass
2689 def testMakeRepo(self) -> None:
2690 # Only applies to DirectButler.
2691 pass
2693 # Pickling not yet implemented for RemoteButler/HybridButler.
2694 @unittest.expectedFailure
2695 def testPickle(self) -> None:
2696 return super().testPickle()
2698 def testStringification(self) -> None:
2699 self.assertEqual(
2700 str(self.server_instance.remote_butler),
2701 "RemoteButler(https://test.example/api/butler/repo/testrepo)",
2702 )
2704 def testTransaction(self) -> None:
2705 # Transactions will never be supported for RemoteButler.
2706 pass
2708 def testPutTemplates(self) -> None:
2709 # The Butler server instance is configured with different file naming
2710 # templates than this test is expecting.
2711 pass
2714def setup_module(module: types.ModuleType) -> None:
2715 """Set up the module for pytest."""
2716 clean_environment()
2719if __name__ == "__main__":
2720 clean_environment()
2721 unittest.main()