# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler.
"""
from __future__ import annotations

import gc
import json
import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import string
import tempfile
import unittest
import uuid
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, cast

try:
    import boto3
    import botocore
    from lsst.resources.s3utils import clean_test_environment_for_s3

    try:
        from moto import mock_aws  # v5
    except ImportError:
        from moto import mock_s3 as mock_aws
except ImportError:
    boto3 = None

    def mock_aws(*args: Any, **kwargs: Any) -> Any:  # type: ignore[no-untyped-def]
        """No-op decorator in case moto mock_aws can not be imported."""
        # A decorator must return the wrapped object; returning None would
        # replace any decorated test class with None.
        return args[0] if args else None
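
# Usage sketch (illustrative, not executed here): with the fallback above,
# a module-level decoration such as
#
#     @mock_aws
#     class SomeS3TestCase(unittest.TestCase):
#         ...
#
# still leaves the class importable when moto is absent; such suites are
# expected to be skipped when ``boto3 is None``.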

try:
    from lsst.daf.butler.tests.server import create_test_server
except ImportError:
    create_test_server = None

try:
    # It's possible but silly to have testing.postgresql installed without
    # having the postgresql server installed (because then nothing in
    # testing.postgresql would work), so we use the presence of that module
    # to test whether we can expect the server to be available.
    import testing.postgresql  # type: ignore[import]
except ImportError:
    testing = None

import astropy.time
import sqlalchemy
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    ButlerRepoIndex,
    CollectionType,
    Config,
    DataCoordinate,
    DatasetExistence,
    DatasetNotFoundError,
    DatasetRef,
    DatasetType,
    FileDataset,
    NoDefaultCollectionError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.datastore import NullDatastore
from lsst.daf.butler.datastore.file_templates import FileTemplate, FileTemplateValidationError
from lsst.daf.butler.datastores.fileDatastore import FileDatastore
from lsst.daf.butler.direct_butler import DirectButler
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
    OrphanedRecordError,
)
from lsst.daf.butler.registry.sql_registry import SqlRegistry
from lsst.daf.butler.repo_relocation import BUTLER_ROOT_TAG
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.introspection import get_full_type_name

if TYPE_CHECKING:
    import types

    from lsst.daf.butler import DimensionGroup, Registry, StorageClass

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def clean_environment() -> None:
    """Remove external environment variables that affect the tests."""
    for k in ("DAF_BUTLER_REPOSITORY_INDEX",):
        os.environ.pop(k, None)


def makeExampleMetrics() -> MetricsExample:
    """Return example dataset suitable for tests."""
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )
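

# A minimal sketch of how the tests consume this example (assuming
# MetricsExample exposes its three constructor arguments as the attributes
# the put/get tests read back):
#
#     metric = makeExampleMetrics()
#     metric.summary  # {"AM1": 5.2, "AM2": 30.6}
#     metric.output   # {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}}
#     metric.data     # [563, 234, 456.7, 752, 8, 9, 27]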


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent misdiagnosing
    that might otherwise occur when a standard exception is used.
    """

    pass
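
# Raised inside a butler.transaction() context (see testTransaction below)
# to verify that registry inserts and datastore writes made within the
# context are rolled back.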


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not tested in any other test
    cases.
    """

    def testSearchPath(self) -> None:
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests(TestCaseMixin):
    """Helper method for running a suite of put/get tests from different
    butler configurations.
    """

    root: str
    default_run = "ingésτ😺"
    storageClassFactory: StorageClassFactory
    configFile: str | None
    tmpConfigFile: str

    @staticmethod
    def addDatasetType(
        datasetTypeName: str, dimensions: DimensionGroup, storageClass: StorageClass | str, registry: Registry
    ) -> DatasetType:
        """Create a DatasetType and register it"""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType
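
    # Typical call, mirroring how the tests below use this helper:
    #
    #     datasetType = self.addDatasetType(
    #         "test_metric",
    #         butler.dimensions.conform(["instrument", "visit"]),
    #         "StructuredData",
    #         butler.registry,
    #     )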

    @classmethod
    def setUpClass(cls) -> None:
        cls.storageClassFactory = StorageClassFactory()
        if cls.configFile is not None:
            cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(
        self,
        butler: Butler,
        datasetRef: DatasetRef,
        components: tuple[str, ...],
        reference: Any,
        collections: Any = None,
    ) -> None:
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self) -> None:
        if self.root is not None:
            removeTestTempDir(self.root)

    def create_empty_butler(self, run: str | None = None, writeable: bool | None = None) -> Butler:
        """Create a Butler for the test repository, without inserting test
        data.
        """
        butler = Butler.from_config(self.tmpConfigFile, run=run, writeable=writeable)
        assert isinstance(butler, DirectButler), "Expect DirectButler in configuration"
        return butler

    def create_butler(
        self, run: str, storageClass: StorageClass | str, datasetTypeName: str
    ) -> tuple[Butler, DatasetType]:
        """Create a Butler for the test repository and insert some test data
        into it.
        """
        butler = self.create_empty_butler(run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {run})
        # Create and register a DatasetType
        dimensions = butler.dimensions.conform(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        butler.registry.insertDimensionData("day_obs", {"instrument": "DummyCamComp", "id": 20200101})
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
                "day_obs": 20200101,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "day_obs": 20200101,
                },
            )
        return butler, datasetType

    def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Butler:
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = self.default_run
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)
        assert butler.run is not None

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = butler.registry.expandDataId({"instrument": "DummyCamComp", "visit": 423})

        # Dataset should not exist if we haven't added it
        with self.assertRaises(DatasetNotFoundError):
            butler.get(datasetTypeName, dataId)

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        ref = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run="put_run_1")
        args: tuple[DatasetRef] | tuple[str | DatasetType, DataCoordinate]
        for args in ((ref,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                kwargs: dict[str, Any] = {}
                if not isinstance(args[0], DatasetRef):  # type: ignore
                    kwargs["run"] = this_run
                ref = butler.put(metric, *args, **kwargs)
                self.assertIsInstance(ref, DatasetRef)

                # Test get of a ref.
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a ref
                metricOut = butler.getDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

                primary_uri, secondary_uris = butler.getURIs(ref)
                n_uris = len(secondary_uris)
                if primary_uri:
                    n_uris += 1

                # Can the artifacts themselves be retrieved?
                if not butler._datastore.isEphemeral:
                    # Create a temporary directory to hold the retrieved
                    # artifacts.
                    with tempfile.TemporaryDirectory(
                        prefix="butler-artifacts-", ignore_cleanup_errors=True
                    ) as artifact_root:
                        root_uri = ResourcePath(artifact_root, forceDirectory=True)

                        for preserve_path in (True, False):
                            destination = root_uri.join(f"{preserve_path}_{counter}/")
                            log = logging.getLogger("lsst.x")
                            log.warning("Using destination %s for args %s", destination, args)
                            # Use copy so that we can test that overwrite
                            # protection works (using "auto" for File URIs
                            # would use hard links and subsequent transfer
                            # would work because it knows they are the same
                            # file).
                            transferred = butler.retrieveArtifacts(
                                [ref], destination, preserve_path=preserve_path, transfer="copy"
                            )
                            self.assertGreater(len(transferred), 0)
                            artifacts = list(ResourcePath.findFileResources([destination]))
                            self.assertEqual(set(transferred), set(artifacts))

                            for artifact in transferred:
                                path_in_destination = artifact.relative_to(destination)
                                self.assertIsNotNone(path_in_destination)
                                assert path_in_destination is not None

                                # When path is not preserved there should not
                                # be any path separators.
                                num_seps = path_in_destination.count("/")
                                if preserve_path:
                                    self.assertGreater(num_seps, 0)
                                else:
                                    self.assertEqual(num_seps, 0)

                            self.assertEqual(
                                len(artifacts),
                                n_uris,
                                "Comparing expected artifacts vs actual:"
                                f" {artifacts} vs {primary_uri} and {secondary_uris}",
                            )

                            if preserve_path:
                                # No need to run these twice
                                with self.assertRaises(ValueError):
                                    butler.retrieveArtifacts([ref], destination, transfer="move")

                                with self.assertRaisesRegex(
                                    ValueError, "^Destination location must refer to a directory"
                                ):
                                    butler.retrieveArtifacts(
                                        [ref], ResourcePath("/some/file.txt", forceDirectory=False)
                                    )

                                with self.assertRaises(FileExistsError):
                                    butler.retrieveArtifacts([ref], destination)

                                transferred_again = butler.retrieveArtifacts(
                                    [ref], destination, preserve_path=preserve_path, overwrite=True
                                )
                                self.assertEqual(set(transferred_again), set(transferred))

                # Now remove the dataset completely.
                butler.pruneDatasets([ref], purge=True, unstore=True)
                # Lookup with original args should still fail.
                kwargs = {"collections": this_run}
                if isinstance(args[0], DatasetRef):
                    kwargs = {}  # Prevent warning from being issued.
                self.assertFalse(butler.exists(*args, **kwargs))
                # get() should still fail.
                with self.assertRaises((FileNotFoundError, DatasetNotFoundError)):
                    butler.get(ref)
                # Registry shouldn't be able to find it by dataset_id anymore.
                self.assertIsNone(butler.get_dataset(ref.id))

            # Do explicit registry removal since we know they are
            # empty
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Create DatasetRef for put using default run.
        refIn = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run=butler.run)

        # Check that getDeferred fails with standalone ref.
        with self.assertRaises(LookupError):
            butler.getDeferred(refIn)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        assert metric.data is not None  # for mypy
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.find_dataset(compNameS, dataId, collections=butler.collections)
            assert compRef is not None
            summary = butler.get(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaisesRegex(
            ValueError,
            "(Supplied dataset type .* inconsistent with registry)"
            "|(The new storage class .* is not compatible with the existing storage class)",
        ):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaisesRegex(ValueError, "DatasetRef given, cannot use dataId as well"):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match.
        with self.assertRaises((FileNotFoundError, DatasetNotFoundError)):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=uuid.UUID(int=101), run=butler.run))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaisesRegex(KeyError, "Parameter 'unsupported' not understood"):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Add the same ref again, so we can check that duplicate put fails.
        ref = butler.put(metric, datasetType, dataId)

        # Repeat put will fail.
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Repeat the same sequence with resolved ref.
        butler.pruneDatasets([ref], unstore=True, purge=True)
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaisesRegex(ConflictingDefinitionError, "Datastore already contains dataset"):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # In case of resolved ref this write will succeed.
        ref = butler.put(metric, refIn)

        # Leave the dataset in place since some downstream tests require
        # something to be present
        return butler

    def testDeferredCollectionPassing(self) -> None:
        # Construct a butler with no run or collection, but make it writeable.
        butler = self.create_empty_butler(writeable=True)
        # Create and register a DatasetType
        dimensions = butler.dimensions.conform(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData("day_obs", {"instrument": "DummyCamComp", "id": 20250101})
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "day_obs": 20250101,
            },
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.exists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is an error.
        with self.assertRaises(NoDefaultCollectionError):
            butler.exists(datasetType, dataId)
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Deleting the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.exists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True
    validationCanFail: bool
    fullConfigKey: str | None
    registryStr: str | None
    datastoreName: list[str] | None
    datastoreStr: list[str]
    predictionSupported = True
    """Does getURIs support 'prediction mode'?"""

    def setUp(self) -> None:
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def are_uris_equivalent(self, uri1: ResourcePath, uri2: ResourcePath) -> bool:
        """Return True if two URIs refer to the same resource.

        Subclasses may override to handle unique requirements.
        """
        return uri1 == uri2
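
    # A hypothetical override for a subclass whose datastore returns signed
    # URLs (names and logic below are illustrative, not part of this suite):
    #
    #     def are_uris_equivalent(self, uri1: ResourcePath, uri2: ResourcePath) -> bool:
    #         # Compare scheme/netloc/path only, ignoring transient query
    #         # parameters such as signatures.
    #         return (uri1.scheme, uri1.netloc, uri1.path) == (
    #             uri2.scheme, uri2.netloc, uri2.path
    #         )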

    def testConstructor(self) -> None:
        """Independent test of constructor."""
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler.from_config(config_dir, run=self.default_run)
            self.assertIsInstance(butler, Butler)

            # Even with a ResourcePath.
            butler = Butler.from_config(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {self.default_run})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler.from_config(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler.from_config(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, ("other",))
        self.assertIsNone(butler2.run)
        self.assertEqual(type(butler._datastore), type(butler2._datastore))
        self.assertEqual(butler._datastore.config, butler2._datastore.config)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"file://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), {"label", "bad_label"})
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler.from_config(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler.from_config("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler.from_config("not_there", writeable=False)
                    with self.assertRaisesRegex(FileNotFoundError, "resolved from alias 'bad_label'"):
                        Butler.from_config("bad_label")
                    with self.assertRaises(FileNotFoundError):
                        # Should ignore aliases.
                        Butler.from_config(ResourcePath("label", forceAbsolute=False))
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertEqual(
                        Butler.get_repo_uri("missing", True), ResourcePath("missing", forceAbsolute=False)
                    )
                    self.assertIn("not known to", str(cm.exception))
                    # Should report no failure.
                    self.assertEqual(ButlerRepoIndex.get_failure_reason(), "")
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                # Now with empty configuration.
                butler_index = Config()
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    with self.assertRaisesRegex(FileNotFoundError, "(no known aliases)"):
                        Butler.from_config("label")
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                # Now with bad contents.
                with open(temp_file.ospath, "w") as fh:
                    print("'", file=fh)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    with self.assertRaisesRegex(FileNotFoundError, "(no known aliases:.*could not be read)"):
                        Butler.from_config("label")
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())

            with self.assertRaisesRegex(FileNotFoundError, "index file not found"):
                Butler.from_config("label")

        # Check that we can create Butler when the alias file is not found.
        butler = Butler.from_config(self.tmpConfigFile, writeable=False)
        self.assertIsInstance(butler, Butler)
        with self.assertRaises(RuntimeError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertEqual(Butler.get_repo_uri("label", True), ResourcePath("label", forceAbsolute=False))
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases.*No repository index"):
            # No aliases registered.
            Butler.from_config("not_there")
        self.assertEqual(Butler.get_known_repos(), set())

    def testDafButlerRepositories(self) -> None:
        with unittest.mock.patch.dict(
            os.environ,
            {"DAF_BUTLER_REPOSITORIES": "label: 'https://someuri.com'\notherLabel: 'https://otheruri.com'\n"},
        ):
            self.assertEqual(str(Butler.get_repo_uri("label")), "https://someuri.com")

        with unittest.mock.patch.dict(
            os.environ,
            {
                "DAF_BUTLER_REPOSITORIES": "label: https://someuri.com",
                "DAF_BUTLER_REPOSITORY_INDEX": "https://someuri.com",
            },
        ):
            with self.assertRaisesRegex(RuntimeError, "Only one of the environment variables"):
                Butler.get_repo_uri("label")

        with unittest.mock.patch.dict(
            os.environ,
            {"DAF_BUTLER_REPOSITORIES": "invalid"},
        ):
            with self.assertRaisesRegex(ValueError, "Repository index not in expected format"):
                Butler.get_repo_uri("label")
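
    # For reference: DAF_BUTLER_REPOSITORIES holds an inline YAML mapping of
    # repository label to URI, e.g.
    #
    #     label: 'https://someuri.com'
    #     otherLabel: 'https://otheruri.com'
    #
    # while DAF_BUTLER_REPOSITORY_INDEX points at a file containing such a
    # mapping; the test above verifies that setting both is an error.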

    def testBasicPutGet(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        if self.predictionSupported:
            dataId = {"instrument": "DummyCamComp", "visit": 424}
            uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
            self.assertFalse(components)
            self.assertIsInstance(uri, ResourcePath)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler._datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        if self.predictionSupported:
            # Predicted dataset
            dataId = {"instrument": "DummyCamComp", "visit": 424}
            uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

            if butler._datastore.isEphemeral:
                # Never disassembled
                self.assertIsInstance(uri, ResourcePath)
                self.assertFalse(components)
                self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
                self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
            else:
                self.assertIsNone(uri)
                self.assertEqual(set(components), set(storageClass.components))
                for compuri in components.values():
                    self.assertIsInstance(compuri, ResourcePath)
                    self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                    self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testStorageClassOverrideGet(self) -> None:
        """Test storage class conversion on get with override."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        datasetTypeName = "anything"
        run = self.default_run

        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset.
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        ref = butler.put(metric, datasetType, dataId)

        # Return native type.
        retrieved = butler.get(ref)
        self.assertEqual(retrieved, metric)

        # Specify an override.
        new_sc = self.storageClassFactory.getStorageClass("MetricsConversion")
        model = butler.get(ref, storageClass=new_sc)
        self.assertNotEqual(type(model), type(retrieved))
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override later.
        deferred = butler.getDeferred(ref)
        model = deferred.get(storageClass=new_sc)
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override up front.
        deferred = butler.getDeferred(ref, storageClass=new_sc)
        model = deferred.get()
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Retrieve a component. Should be a tuple.
        data = butler.get("anything.data", dataId, storageClass="StructuredDataDataTestTuple")
        self.assertIs(type(data), tuple)
        self.assertEqual(data, tuple(retrieved.data))

        # Parameter on the write storage class should work regardless
        # of read storage class.
        data = butler.get(
            "anything.data",
            dataId,
            storageClass="StructuredDataDataTestTuple",
            parameters={"slice": slice(2, 4)},
        )
        self.assertEqual(len(data), 2)

        # Try a parameter that is known to the read storage class but not
        # the write storage class.
        with self.assertRaises(KeyError):
            butler.get(
                "anything.data",
                dataId,
                storageClass="StructuredDataDataTestTuple",
                parameters={"xslice": slice(2, 4)},
            )

    def testPytypePutCoercion(self) -> None:
        """Test python type coercion on Butler.get and put."""
        # Store some data with the normal example storage class.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        datasetTypeName = "test_metric"
        butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)

        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Put a dict and this should coerce to a MetricsExample
        test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
        metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
        test_metric = butler.get(metric_ref)
        self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
        self.assertEqual(test_metric.summary, test_dict["summary"])
        self.assertEqual(test_metric.output, test_dict["output"])

        # Check that the put still works if a DatasetType is given with
        # a definition matching this python type.
        registry_type = butler.get_dataset_type(datasetTypeName)
        this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
        metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
        self.assertEqual(metric2_ref.datasetType, registry_type)

        # The get will return the type expected by registry.
        test_metric2 = butler.get(metric2_ref)
        self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")

        # Make a new DatasetRef with the compatible but different DatasetType.
        # This should now return a dict.
        new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
        test_dict2 = butler.get(new_ref)
        self.assertEqual(get_full_type_name(test_dict2), "dict")

        # Get it again with the wrong dataset type definition, this time
        # passing the DatasetType rather than a ref. This should be
        # consistent with the ref-based get() behavior above and return the
        # python type of the given DatasetType.
        test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
        self.assertEqual(get_full_type_name(test_dict3), "dict")

    def testIngest(self) -> None:
        butler = self.create_empty_butler(run=self.default_run)

        # Create and register a DatasetType
        dimensions = butler.dimensions.conform(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData("day_obs", {"instrument": "DummyCamComp", "id": 20250101})
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "day_obs": 20250101,
            },
            {
                "instrument": "DummyCamComp",
                "id": 424,
                "name": "fourtwentyfour",
                "physical_filter": "d-r",
                "day_obs": 20250101,
            },
        )

        formatter = doImportType("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, run=self.default_run)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertFalse(self.are_uris_equivalent(uri1, uri2), f"Cf. {uri1} with {uri2}")

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, run=self.default_run))

        # Test "move" transfer to ensure that the files themselves
        # have disappeared following ingest.
        with ResourcePath.temporary_uri(suffix=".yaml") as tempFile:
            tempFile.transfer_from(ResourcePath(metricFile), transfer="copy")

            datasets = []
            datasets.append(FileDataset(path=tempFile, refs=refs, formatter=MultiDetectorFormatter))

            # For first ingest use copy.
            butler.ingest(*datasets, transfer="copy", record_validation_info=False)

            # Now try to ingest again in "execution butler" mode where
            # the registry entries exist but the datastore does not have
            # the files. We also need to strip the dimension records to ensure
            # that they will be re-added by the ingest.
            ref = datasets[0].refs[0]
            datasets[0].refs = [
                cast(
                    DatasetRef,
                    butler.find_dataset(ref.datasetType, data_id=ref.dataId, collections=ref.run),
                )
                for ref in datasets[0].refs
            ]
            all_refs = []
            for dataset in datasets:
                refs = []
                for ref in dataset.refs:
                    # Create a dict from the dataId to drop the records.
                    new_data_id = dict(ref.dataId.required)
                    new_ref = butler.find_dataset(ref.datasetType, new_data_id, collections=ref.run)
                    assert new_ref is not None
                    self.assertFalse(new_ref.dataId.hasRecords())
                    refs.append(new_ref)
                dataset.refs = refs
                all_refs.extend(dataset.refs)
            butler.pruneDatasets(all_refs, disassociate=False, unstore=True, purge=False)

            # Use move mode to test that the file is deleted. Also
            # disable recording of file size.
            butler.ingest(*datasets, transfer="move", record_validation_info=False)

            # Check that every ref now has records.
            for dataset in datasets:
                for ref in dataset.refs:
                    self.assertTrue(ref.dataId.hasRecords())

            # Ensure that the file has disappeared.
            self.assertFalse(tempFile.exists())

            # Check that the datastore recorded no file size.
            # Not all datastores can support this.
            try:
                infos = butler._datastore.getStoredItemsInfo(datasets[0].refs[0])  # type: ignore[attr-defined]
                self.assertEqual(infos[0].file_size, -1)
            except AttributeError:
                pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertTrue(self.are_uris_equivalent(uri1, uri2), f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.exists(datasetTypeName, dataId1))
        self.assertTrue(butler.exists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

        # Ensure we can ingest 0 datasets
        datasets = []
        butler.ingest(*datasets)

    def testPickle(self) -> None:
        """Test pickle support."""
        butler = self.create_empty_butler(run=self.default_run)
        assert isinstance(butler, DirectButler), "Expect DirectButler in configuration"
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self) -> None:
        butler = self.create_empty_butler(run=self.default_run)
        dimensions = butler.dimensions.conform(["instrument", "visit", "physical_filter"])
        dimensionEntries: list[tuple[str, list[Mapping[str, Any]]]] = [
            (
                "instrument",
                [
                    {"instrument": "DummyCam"},
                    {"instrument": "DummyHSC"},
                    {"instrument": "DummyCamComp"},
                ],
            ),
            ("physical_filter", [{"instrument": "DummyCam", "name": "d-r", "band": "R"}]),
            ("day_obs", [{"instrument": "DummyCam", "id": 20250101}]),
            (
                "visit",
                [
                    {
                        "instrument": "DummyCam",
                        "id": 42,
                        "name": "fortytwo",
                        "physical_filter": "d-r",
                        "day_obs": 20250101,
                    }
                ],
            ),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for element, data in dimensionEntries:
            butler.registry.insertDimensionData(element, *data)

        # When a DatasetType is added to the registry entries are not created
        # for components but querying them can return the components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry: set[DatasetType] = set()
        for parent_dataset_type in butler.registry.queryDatasetTypes():
            fromRegistry.add(parent_dataset_type)
            fromRegistry.update(parent_dataset_type.makeAllComponentDatasetTypes())
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self) -> None:
        butler = self.create_empty_butler(run=self.default_run)
        datasetTypeName = "test_metric"
        dimensions = butler.dimensions.conform(["instrument", "visit"])
        dimensionEntries: tuple[tuple[str, Mapping[str, Any]], ...] = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("day_obs", {"instrument": "DummyCam", "id": 20250101}),
            (
                "visit",
                {
                    "instrument": "DummyCam",
                    "id": 42,
                    "name": "fortytwo",
                    "physical_filter": "d-r",
                    "day_obs": 20250101,
                },
            ),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test get of a ref.
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.find_dataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.get(ref)

    def testMakeRepo(self) -> None:
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler.from_config(butlerConfig)
        assert isinstance(butler1, DirectButler), "Expect DirectButler in configuration"
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler.from_config(butlerConfig)
        assert isinstance(butler2, DirectButler), "Expect DirectButler in configuration"
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler.from_config(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self) -> None:
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler._datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self) -> None:
        """Test that dataIds can be rewritten based on dimension records."""
        butler = self.create_empty_butler(run=self.default_run)

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.dimensions.conform(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        butler.registry.insertDimensionData("day_obs", {"instrument": "DummyCamComp", "id": dayobs})

        for i in range(n_exposures):
            butler.registry.insertDimensionData("group", {"instrument": "DummyCamComp", "name": f"group{i}"})
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                    "group": f"group{i}",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)

        # Check that we can find the datasets using the day_obs or the
        # exposure.day_obs.
        datasets_1 = list(
            butler.registry.queryDatasets(
                datasetType,
                collections=self.default_run,
                where="day_obs = dayObs AND instrument = instr",
                bind={"dayObs": dayobs, "instr": "DummyCamComp"},
            )
        )
        datasets_2 = list(
            butler.registry.queryDatasets(
                datasetType,
                collections=self.default_run,
                where="exposure.day_obs = dayObs AND instrument = instr",
                bind={"dayObs": dayobs, "instr": "DummyCamComp"},
            )
        )
        self.assertEqual(datasets_1, datasets_2)

    def testGetDatasetCollectionCaching(self) -> None:
        # Prior to DM-41117, there was a bug where get_dataset would throw
        # MissingCollectionError if you tried to fetch a dataset that was added
        # after the collection cache was last updated.
        reader_butler, datasetType = self.create_butler(self.default_run, "int", "datasettypename")
        writer_butler = self.create_empty_butler(writeable=True, run="new_run")
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        put_ref = writer_butler.put(123, datasetType, dataId)
        get_ref = reader_butler.get_dataset(put_ref.id)
        self.assertEqual(get_ref.id, put_ref.id)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    trustModeSupported = True

    def checkFileExists(self, root: str | ResourcePath, relpath: str | ResourcePath) -> bool:
        """Check if a file exists at a given path (relative to root).

        The test testPutTemplates verifies the actual physical existence of
        the files in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()
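
    # Usage sketch (the relative path is illustrative; real expected paths
    # are asserted in testPutTemplates below):
    #
    #     self.assertTrue(self.checkFileExists(self.root, "run/metric1/file.pickle"))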
1407 def testPutTemplates(self) -> None:
1408 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1409 butler = self.create_empty_butler(run=self.default_run)
1411 # Add needed Dimensions
1412 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1413 butler.registry.insertDimensionData(
1414 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
1415 )
1416 butler.registry.insertDimensionData("day_obs", {"instrument": "DummyCamComp", "id": 20250101})
1417 butler.registry.insertDimensionData(
1418 "visit",
1419 {
1420 "instrument": "DummyCamComp",
1421 "id": 423,
1422 "name": "v423",
1423 "physical_filter": "d-r",
1424 "day_obs": 20250101,
1425 },
1426 )
1427 butler.registry.insertDimensionData(
1428 "visit",
1429 {
1430 "instrument": "DummyCamComp",
1431 "id": 425,
1432 "name": "v425",
1433 "physical_filter": "d-r",
1434 "day_obs": 20250101,
1435 },
1436 )
1438 # Create and store a dataset
1439 metric = makeExampleMetrics()
1441 # Create two almost-identical DatasetTypes (both will use default
1442 # template)
1443 dimensions = butler.dimensions.conform(["instrument", "visit"])
1444 butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
1445 butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
1446 butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))
1448 dataId1 = {"instrument": "DummyCamComp", "visit": 423}
1449 dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}
1451 # Put with exactly the data ID keys needed
1452 ref = butler.put(metric, "metric1", dataId1)
1453 uri = butler.getURI(ref)
1454 self.assertTrue(uri.exists())
1455 self.assertTrue(
1456 uri.unquoted_path.endswith(f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle")
1457 )
1459 # Check the template based on dimensions
1460 if hasattr(butler._datastore, "templates"):
1461 butler._datastore.templates.validateTemplates([ref])
1463 # Put with extra data ID keys (physical_filter is an optional
1464 # dependency); should not change template (at least the way we're
1465 # defining them to behave now; the important thing is that they
1466 # must be consistent).
1467 ref = butler.put(metric, "metric2", dataId2)
1468 uri = butler.getURI(ref)
1469 self.assertTrue(uri.exists())
1470 self.assertTrue(
1471 uri.unquoted_path.endswith(f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle")
1472 )
1474 # Check the template based on dimensions
1475 if hasattr(butler._datastore, "templates"):
1476 butler._datastore.templates.validateTemplates([ref])
1478 # Use a template that has a typo in dimension record metadata.
1479 # Easier to test with a butler that has a ref with records attached.
1480 template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits")
1481 with self.assertLogs("lsst.daf.butler.datastore.file_templates", "INFO"):
1482 path = template.format(ref)
1483 self.assertEqual(path, f"a/v423/{ref.id}_fits")
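# The ":?" suffix marks a template field as optional: the unknown record
# attribute "namex" is dropped from the formatted path (note the missing
# value before "fits" above) and is only logged, whereas the same typo
# without the "?" raises KeyError below.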
1485 template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits")
1486 with self.assertRaises(KeyError):
1487 with self.assertLogs("lsst.daf.butler.datastore.file_templates", "INFO"):
1488 template.format(ref)
1490 # Now use a file template that will not result in unique filenames
1491 with self.assertRaises(FileTemplateValidationError):
1492 butler.put(metric, "metric3", dataId1)
1494 def testImportExport(self) -> None:
1495 # Run put/get tests just to create and populate a repo.
1496 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1497 self.runImportExportTest(storageClass)
1499 @unittest.expectedFailure
1500 def testImportExportVirtualComposite(self) -> None:
1501 # Run put/get tests just to create and populate a repo.
1502 storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
1503 self.runImportExportTest(storageClass)
1505 def runImportExportTest(self, storageClass: StorageClass) -> None:
1506 """Test exporting and importing.
1508 This test does an export to a temp directory and an import back
1509 into a new temp directory repo. It does not assume a posix datastore.
1510 """
1511 exportButler = self.runPutGetTest(storageClass, "test_metric")
1513 # Test that we must have a file extension.
1514 with self.assertRaises(ValueError):
1515 with exportButler.export(filename="dump", directory=".") as export:
1516 pass
1518 # Test that unknown format is not allowed.
1519 with self.assertRaises(ValueError):
1520 with exportButler.export(filename="dump.fits", directory=".") as export:
1521 pass
1523 # Test that the repo actually has at least one dataset.
1524 datasets = list(exportButler.registry.queryDatasets(..., collections=...))
1525 self.assertGreater(len(datasets), 0)
1526 # Add a DimensionRecord that's unused by those datasets.
1527 skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
1528 exportButler.registry.insertDimensionData("skymap", skymapRecord)
1529 # Export and then import datasets.
1530 with safeTestTempDir(TESTDIR) as exportDir:
1531 exportFile = os.path.join(exportDir, "exports.yaml")
1532 with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
1533 export.saveDatasets(datasets)
1534 # Export the same datasets again. This should quietly do
1535 # nothing because of internal deduplication, and it shouldn't
1536 # complain about being asked to export the "htm7" elements even
1537 # though there aren't any in these datasets or in the database.
1538 export.saveDatasets(datasets, elements=["htm7"])
1539 # Save one of the data IDs again; this should be harmless
1540 # because of internal deduplication.
1541 export.saveDataIds([datasets[0].dataId])
1542 # Save some dimension records directly.
1543 export.saveDimensionData("skymap", [skymapRecord])
1544 self.assertTrue(os.path.exists(exportFile))
1545 with safeTestTempDir(TESTDIR) as importDir:
1546 # We always want this to be a local posix butler
1547 Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
1548 # Calling script.butlerImport tests the implementation of the
1549 # butler command line interface "import" subcommand. Functions
1550 # in the script folder are generally considered protected and
1551 # should not be used as a public API.
1552 with open(exportFile) as f:
1553 script.butlerImport(
1554 importDir,
1555 export_file=f,
1556 directory=exportDir,
1557 transfer="auto",
1558 skip_dimensions=None,
1559 )
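# Roughly the equivalent command-line invocation (the exact flag
# spellings here are an assumption, not verified by this test):
#   butler import <importDir> <exportDir> \
#       --export-file exports.yaml --transfer auto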
1560 importButler = Butler.from_config(importDir, run=self.default_run)
1561 for ref in datasets:
1562 with self.subTest(ref=ref):
1563 # Test for existence by passing in the DatasetType and
1564 # data ID separately, to avoid lookup by dataset_id.
1565 self.assertTrue(importButler.exists(ref.datasetType, ref.dataId))
1566 self.assertEqual(
1567 list(importButler.registry.queryDimensionRecords("skymap")),
1568 [importButler.dimensions["skymap"].RecordClass(**skymapRecord)],
1569 )
1571 def testRemoveRuns(self) -> None:
1572 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1573 butler = self.create_empty_butler(writeable=True)
1574 # Load registry data with dimensions to hang datasets off of.
1575 registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
1576 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1577 # Add some RUN-type collections.
1578 run1 = "run1"
1579 butler.registry.registerRun(run1)
1580 run2 = "run2"
1581 butler.registry.registerRun(run2)
1582 # Put a dataset in each run.
1583 metric = makeExampleMetrics()
1584 dimensions = butler.dimensions.conform(["instrument", "physical_filter"])
1585 datasetType = self.addDatasetType(
1586 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1587 )
1588 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1589 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1590 uri1 = butler.getURI(ref1)
1591 uri2 = butler.getURI(ref2)
1593 with self.assertRaises(OrphanedRecordError):
1594 butler.registry.removeDatasetType(datasetType.name)
1596 # Remove from both runs with different values for unstore.
1597 butler.removeRuns([run1], unstore=True)
1598 butler.removeRuns([run2], unstore=False)
1599 # Should be nothing in registry for either one, and datastore should
1600 # not think either exists.
1601 with self.assertRaises(MissingCollectionError):
1602 butler.registry.getCollectionType(run1)
1603 with self.assertRaises(MissingCollectionError):
1604 butler.registry.getCollectionType(run2)
1605 self.assertFalse(butler.stored(ref1))
1606 self.assertFalse(butler.stored(ref2))
1607 # The ref we unstored should be gone according to the URI, but the
1608 # one we forgot should still be around.
1609 self.assertFalse(uri1.exists())
1610 self.assertTrue(uri2.exists())
1612 # Now that the collections have been pruned, we can remove the
1613 # dataset type.
1614 butler.registry.removeDatasetType(datasetType.name)
1616 with self.assertLogs("lsst.daf.butler.registry", "INFO") as cm:
1617 butler.registry.removeDatasetType(("test*", "test*"))
1618 self.assertIn("not defined", "\n".join(cm.output))
1620 def remove_dataset_out_of_band(self, butler: Butler, ref: DatasetRef) -> None:
1621 """Simulate an external actor removing a file outside of Butler's
1622 knowledge.
1624 Subclasses may override to handle more complicated datastore
1625 configurations.
1626 """
1627 uri = butler.getURI(ref)
1628 uri.remove()
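# Removing the file artifact is not enough: the datastore cache manager
# may still hold a local copy of the dataset, so purge that as well or a
# later get() could be served from the cache.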
1629 datastore = cast(FileDatastore, butler._datastore)
1630 datastore.cacheManager.remove_from_cache(ref)
1632 def testPruneDatasets(self) -> None:
1633 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1634 butler = self.create_empty_butler(writeable=True)
1635 # Load registry data with dimensions to hang datasets off of.
1636 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1637 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1638 # Add some RUN-type collections.
1639 run1 = "run1"
1640 butler.registry.registerRun(run1)
1641 run2 = "run2"
1642 butler.registry.registerRun(run2)
1643 # Put some datasets. ref1 and ref2 have the same data ID and are in
1644 # different runs. ref3 has a different data ID.
1645 metric = makeExampleMetrics()
1646 dimensions = butler.dimensions.conform(["instrument", "physical_filter"])
1647 datasetType = self.addDatasetType(
1648 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1649 )
1650 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1651 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1652 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1654 many_stored = butler.stored_many([ref1, ref2, ref3])
1655 for ref, stored in many_stored.items():
1656 self.assertTrue(stored, f"Ref {ref} should be stored")
1658 many_exists = butler._exists_many([ref1, ref2, ref3])
1659 for ref, exists in many_exists.items():
1660 self.assertTrue(exists, f"Checking ref {ref} exists.")
1661 self.assertEqual(exists, DatasetExistence.VERIFIED, f"Ref {ref} should be stored")
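# A sketch of how the DatasetExistence flags compose, inferred from the
# assertions in this test (not an authoritative API reference):
#   RECORDED   registry knows the dataset
#   DATASTORE  a datastore record exists
#   _ARTIFACT  the file artifact was verified to exist
#   _ASSUMED   artifact existence was assumed rather than checked
#   KNOWN      ~ RECORDED | DATASTORE | _ASSUMED   (full_check=False)
#   VERIFIED   ~ RECORDED | DATASTORE | _ARTIFACT  (full_check=True)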
1663 # Simple prune.
1664 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1665 self.assertFalse(butler.exists(ref1.datasetType, ref1.dataId, collections=run1))
1667 many_stored = butler.stored_many([ref1, ref2, ref3])
1668 for ref, stored in many_stored.items():
1669 self.assertFalse(stored, f"Ref {ref} should not be stored")
1671 many_exists = butler._exists_many([ref1, ref2, ref3])
1672 for ref, exists in many_exists.items():
1673 self.assertEqual(exists, DatasetExistence.UNRECOGNIZED, f"Ref {ref} should not be stored")
1675 # Put data back.
1676 ref1_new = butler.put(metric, ref1)
1677 self.assertEqual(ref1_new, ref1) # Reuses original ID.
1678 ref2 = butler.put(metric, ref2)
1680 many_stored = butler.stored_many([ref1, ref2, ref3])
1681 self.assertTrue(many_stored[ref1])
1682 self.assertTrue(many_stored[ref2])
1683 self.assertFalse(many_stored[ref3])
1685 ref3 = butler.put(metric, ref3)
1687 many_exists = butler._exists_many([ref1, ref2, ref3])
1688 for ref, exists in many_exists.items():
1689 self.assertTrue(exists, f"Ref {ref} should be stored")
1691 # Clear out the datasets from registry and start again.
1692 refs = [ref1, ref2, ref3]
1693 butler.pruneDatasets(refs, purge=True, unstore=True)
1694 for ref in refs:
1695 butler.put(metric, ref)
1697 # Confirm we can retrieve deferred.
1698 dref1 = butler.getDeferred(ref1) # known and exists
1699 metric1 = dref1.get()
1700 self.assertEqual(metric1, metric)
1702 # Test different forms of file availability.
1703 # Need to be in a state where:
1704 # - one ref has only a registry record.
1705 # - one ref has a missing file but a datastore record.
1706 # - one ref has a missing datastore record but file is there.
1707 # - one ref does not exist anywhere.
1708 # Do not need to test a ref that has everything since that is tested
1709 # above.
1710 ref0 = DatasetRef(
1711 datasetType,
1712 DataCoordinate.standardize(
1713 {"instrument": "Cam1", "physical_filter": "Cam1-G"}, universe=butler.dimensions
1714 ),
1715 run=run1,
1716 )
1718 # Delete from datastore and retain in Registry.
1719 butler.pruneDatasets([ref1], purge=False, unstore=True, disassociate=False)
1721 # File has been removed.
1722 self.remove_dataset_out_of_band(butler, ref2)
1724 # Datastore has lost track.
1725 butler._datastore.forget([ref3])
1727 # First test with a standard butler.
1728 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True)
1729 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED)
1730 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED)
1731 self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE)
1732 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED)
1734 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=False)
1735 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED)
1736 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED | DatasetExistence._ASSUMED)
1737 self.assertEqual(exists_many[ref2], DatasetExistence.KNOWN)
1738 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ASSUMED)
1739 self.assertTrue(exists_many[ref2])
1741 # Check that per-ref query gives the same answer as many query.
1742 for ref, exists in exists_many.items():
1743 self.assertEqual(butler.exists(ref, full_check=False), exists)
1745 # getDeferred checks for existence before it allows the dataset
1746 # to be retrieved.
1747 with self.assertRaises(LookupError):
1748 butler.getDeferred(ref3) # not known, file exists
1749 dref2 = butler.getDeferred(ref2) # known but file missing
1750 with self.assertRaises(FileNotFoundError):
1751 dref2.get()
1753 # Test again with a trusting butler.
1754 if self.trustModeSupported:
1755 butler._datastore.trustGetRequest = True
1756 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True)
1757 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED)
1758 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED)
1759 self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE)
1760 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ARTIFACT)
1762 # When trusting we can get a deferred dataset handle that is not
1763 # known but does exist.
1764 dref3 = butler.getDeferred(ref3)
1765 metric3 = dref3.get()
1766 self.assertEqual(metric3, metric)
1768 # Check that per-ref query gives the same answer as many query.
1769 for ref, exists in exists_many.items():
1770 self.assertEqual(butler.exists(ref, full_check=True), exists)
1772 # Create a ref that surprisingly has the UUID of an existing ref
1773 # but is not the same.
1774 ref_bad = DatasetRef(datasetType, dataId=ref3.dataId, run=ref3.run, id=ref2.id)
1775 with self.assertRaises(ValueError):
1776 butler.exists(ref_bad)
1778 # Create a ref that has a compatible storage class.
1779 ref_compat = ref2.overrideStorageClass("StructuredDataDict")
1780 exists = butler.exists(ref_compat)
1781 self.assertEqual(exists, exists_many[ref2])
1783 # Remove everything and start from scratch.
1784 butler._datastore.trustGetRequest = False
1785 butler.pruneDatasets(refs, purge=True, unstore=True)
1786 for ref in refs:
1787 butler.put(metric, ref)
1789 # These tests mess directly with the trash table and can leave the
1790 # datastore in an odd state. Do them at the end.
1791 # Check that in normal mode, deleting the record means that
1792 # emptying the trash will not touch the file.
1793 uri1 = butler.getURI(ref1)
1794 butler._datastore.bridge.moveToTrash(
1795 [ref1], transaction=None
1796 ) # Update the dataset_location table
1797 butler._datastore.forget([ref1])
1798 butler._datastore.trash(ref1)
1799 butler._datastore.emptyTrash()
1800 self.assertTrue(uri1.exists())
1801 uri1.remove() # Clean it up.
1803 # Simulate execution butler setup by deleting the datastore
1804 # record but keeping the file around and enabling trust mode.
1805 butler._datastore.trustGetRequest = True
1806 uris = butler.get_many_uris([ref2, ref3])
1807 uri2 = uris[ref2].primaryURI
1808 uri3 = uris[ref3].primaryURI
1809 self.assertTrue(uri2.exists())
1810 self.assertTrue(uri3.exists())
1812 # Remove the datastore record.
1813 butler._datastore.bridge.moveToTrash(
1814 [ref2], transaction=None
1815 ) # Update the dataset_location table
1816 butler._datastore.forget([ref2])
1817 self.assertTrue(uri2.exists())
1818 butler._datastore.trash([ref2, ref3])
1819 # Immediate removal of the ref2 file.
1820 self.assertFalse(uri2.exists())
1821 # But ref3 has to wait for the trash to be emptied.
1822 self.assertTrue(uri3.exists())
1823 butler._datastore.emptyTrash()
1824 self.assertFalse(uri3.exists())
1826 # Clear out the datasets from registry.
1827 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1830class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1831 """PosixDatastore specialization of a butler"""
1833 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1834 fullConfigKey: str | None = ".datastore.formatters"
1835 validationCanFail = True
1836 datastoreStr = ["/tmp"]
1837 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1838 registryStr = "/gen3.sqlite3"
1840 def testPathConstructor(self) -> None:
1841 """Independent test of constructor using PathLike."""
1842 butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
1843 self.assertIsInstance(butler, Butler)
1845 # And again with a Path object with the butler yaml
1846 path = pathlib.Path(self.tmpConfigFile)
1847 butler = Butler.from_config(path, writeable=False)
1848 self.assertIsInstance(butler, Butler)
1850 # And again with a Path object without the butler yaml
1851 # (making sure we skip it if the tmp config doesn't end
1852 # in butler.yaml -- which is the case for a subclass)
1853 if self.tmpConfigFile.endswith("butler.yaml"):
1854 path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
1855 butler = Butler.from_config(path, writeable=False)
1856 self.assertIsInstance(butler, Butler)
1858 def testExportTransferCopy(self) -> None:
1859 """Test local export using all transfer modes"""
1860 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1861 exportButler = self.runPutGetTest(storageClass, "test_metric")
1862 # Test that the repo actually has at least one dataset.
1863 datasets = list(exportButler.registry.queryDatasets(..., collections=...))
1864 self.assertGreater(len(datasets), 0)
1865 uris = [exportButler.getURI(d) for d in datasets]
1866 assert isinstance(exportButler._datastore, FileDatastore)
1867 datastoreRoot = exportButler.get_datastore_roots()[exportButler.get_datastore_names()[0]]
1869 pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]
1871 for path in pathsInStore:
1872 # Assume local file system
1873 assert path is not None
1874 self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")
1876 for transfer in ("copy", "link", "symlink", "relsymlink"):
1877 with safeTestTempDir(TESTDIR) as exportDir:
1878 with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
1879 export.saveDatasets(datasets)
1880 for path in pathsInStore:
1881 assert path is not None
1882 self.assertTrue(
1883 self.checkFileExists(exportDir, path),
1884 f"Check that mode {transfer} exported files",
1885 )
1887 def testPytypeCoercion(self) -> None:
1888 """Test python type coercion on Butler.get and put."""
1889 # Store some data with the normal example storage class.
1890 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1891 datasetTypeName = "test_metric"
1892 butler = self.runPutGetTest(storageClass, datasetTypeName)
1894 dataId = {"instrument": "DummyCamComp", "visit": 423}
1895 metric = butler.get(datasetTypeName, dataId=dataId)
1896 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1898 datasetType_ori = butler.get_dataset_type(datasetTypeName)
1899 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1901 # Now need to hack the registry dataset type definition.
1902 # There is no API for this.
1903 assert isinstance(butler._registry, SqlRegistry)
1904 manager = butler._registry._managers.datasets
1905 assert hasattr(manager, "_db") and hasattr(manager, "_static")
1906 manager._db.update(
1907 manager._static.dataset_type,
1908 {"name": datasetTypeName},
1909 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1910 )
1912 # Force reset of dataset type cache
1913 butler.registry.refresh()
1915 datasetType_new = butler.get_dataset_type(datasetTypeName)
1916 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1917 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1919 metric_model = butler.get(datasetTypeName, dataId=dataId)
1920 self.assertNotEqual(type(metric_model), type(metric))
1921 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1923 # Put the model and read it back to show that everything now
1924 # works as normal.
1925 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1926 metric_model_new = butler.get(metric_ref)
1927 self.assertEqual(metric_model_new, metric_model)
1929 # Hack the storage class again to something that will fail on the
1930 # get with no conversion class.
1931 manager._db.update(
1932 manager._static.dataset_type,
1933 {"name": datasetTypeName},
1934 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1935 )
1936 butler.registry.refresh()
1938 with self.assertRaises(ValueError):
1939 butler.get(datasetTypeName, dataId=dataId)
1942@unittest.skipUnless(testing is not None, "testing.postgresql module not found")
1943class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1944 """PosixDatastore specialization of a butler using Postgres"""
1946 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1947 fullConfigKey = ".datastore.formatters"
1948 validationCanFail = True
1949 datastoreStr = ["/tmp"]
1950 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1951 registryStr = "PostgreSQL@test"
1952 postgresql: Any
1954 @staticmethod
1955 def _handler(postgresql: Any) -> None:
1956 engine = sqlalchemy.engine.create_engine(postgresql.url())
1957 with engine.begin() as connection:
1958 connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;"))
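# The btree_gist extension is assumed to be needed so that the registry
# can create GiST exclusion constraints that mix scalar columns with
# timespan ranges; without it, repo creation on Postgres fails.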
1960 @classmethod
1961 def setUpClass(cls) -> None:
1962 # Create the postgres test server.
1963 cls.postgresql = testing.postgresql.PostgresqlFactory(
1964 cache_initialized_db=True, on_initialized=cls._handler
1965 )
1966 super().setUpClass()
1968 @classmethod
1969 def tearDownClass(cls) -> None:
1970 # Clean up any lingering SQLAlchemy engines/connections
1971 # so they're closed before we shut down the server.
1972 gc.collect()
1973 cls.postgresql.clear_cache()
1974 super().tearDownClass()
1976 def setUp(self) -> None:
1977 self.server = self.postgresql()
1979 # Need to add a registry section to the config.
1980 self._temp_config = False
1981 config = Config(self.configFile)
1982 config["registry", "db"] = self.server.url()
1983 with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh:
1984 config.dump(fh)
1985 self.configFile = fh.name
1986 self._temp_config = True
1987 super().setUp()
1989 def tearDown(self) -> None:
1990 self.server.stop()
1991 if self._temp_config and os.path.exists(self.configFile):
1992 os.remove(self.configFile)
1993 super().tearDown()
1995 def testMakeRepo(self) -> None:
1996 # The base class test assumes that it is using SQLite and that
1997 # the config file is acceptable to SQLite.
1998 raise unittest.SkipTest("Postgres config is not compatible with this test.")
2001@unittest.skipUnless(testing is not None, "testing.postgresql module not found")
2002class ClonedPostgresPosixDatastoreButlerTestCase(PostgresPosixDatastoreButlerTestCase, unittest.TestCase):
2003 """Test that Butler with a Postgres registry still works after cloning."""
2005 def create_butler(
2006 self, run: str, storageClass: StorageClass | str, datasetTypeName: str
2007 ) -> tuple[DirectButler, DatasetType]:
2008 butler, datasetType = super().create_butler(run, storageClass, datasetTypeName)
2009 return butler._clone(run=run), datasetType
2012class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
2013 """InMemoryDatastore specialization of a butler"""
2015 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
2016 fullConfigKey = None
2017 useTempRoot = False
2018 validationCanFail = False
2019 datastoreStr = ["datastore='InMemory"]
2020 datastoreName = ["InMemoryDatastore@"]
2021 registryStr = "/gen3.sqlite3"
2023 def testIngest(self) -> None:
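# File ingest is presumably not meaningful for an in-memory datastore,
# so the inherited test is disabled here.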
2024 pass
2027class ClonedSqliteButlerTestCase(InMemoryDatastoreButlerTestCase, unittest.TestCase):
2028 """Test that a Butler with a Sqlite registry still works after cloning."""
2030 def create_butler(
2031 self, run: str, storageClass: StorageClass | str, datasetTypeName: str
2032 ) -> tuple[DirectButler, DatasetType]:
2033 butler, datasetType = super().create_butler(run, storageClass, datasetTypeName)
2034 return butler._clone(run=run), datasetType
2037class ChainedDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
2038 """PosixDatastore specialization"""
2040 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
2041 fullConfigKey = ".datastore.datastores.1.formatters"
2042 validationCanFail = True
2043 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
2044 datastoreName = [
2045 "InMemoryDatastore@",
2046 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
2047 "SecondDatastore",
2048 ]
2049 registryStr = "/gen3.sqlite3"
2051 def testPruneDatasets(self) -> None:
2052 # This test relies on manipulating files out-of-band, which is
2053 # impossible for this configuration because of the InMemoryDatastore in
2054 # the ChainedDatastore.
2055 pass
2058class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
2059 """Test that a yaml file in one location can refer to a root in another."""
2061 datastoreStr = ["dir1"]
2062 # Disable the makeRepo test since we are deliberately not using
2063 # butler.yaml as the config name.
2064 fullConfigKey = None
2066 def setUp(self) -> None:
2067 self.root = makeTestTempDir(TESTDIR)
2069 # Make a new repository in one place
2070 self.dir1 = os.path.join(self.root, "dir1")
2071 Butler.makeRepo(self.dir1, config=Config(self.configFile))
2073 # Move the yaml file to a different place and add a "root"
2074 self.dir2 = os.path.join(self.root, "dir2")
2075 os.makedirs(self.dir2, exist_ok=True)
2076 configFile1 = os.path.join(self.dir1, "butler.yaml")
2077 config = Config(configFile1)
2078 config["root"] = self.dir1
2079 configFile2 = os.path.join(self.dir2, "butler2.yaml")
2080 config.dumpToUri(configFile2)
2081 os.remove(configFile1)
2082 self.tmpConfigFile = configFile2
2084 def testFileLocations(self) -> None:
2085 self.assertNotEqual(self.dir1, self.dir2)
2086 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
2087 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
2088 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
2091class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
2092 """Test that a config file created by makeRepo outside of repo works."""
2094 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
2096 def setUp(self) -> None:
2097 self.root = makeTestTempDir(TESTDIR)
2098 self.root2 = makeTestTempDir(TESTDIR)
2100 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
2101 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
2103 def tearDown(self) -> None:
2104 if os.path.exists(self.root2):
2105 shutil.rmtree(self.root2, ignore_errors=True)
2106 super().tearDown()
2108 def testConfigExistence(self) -> None:
2109 c = Config(self.tmpConfigFile)
2110 uri_config = ResourcePath(c["root"])
2111 uri_expected = ResourcePath(self.root, forceDirectory=True)
2112 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
2113 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
2115 def testPutGet(self) -> None:
2116 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
2117 self.runPutGetTest(storageClass, "test_metric")
2120class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
2121 """Test that a config file created by makeRepo outside of repo works."""
2123 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
2125 def setUp(self) -> None:
2126 self.root = makeTestTempDir(TESTDIR)
2127 self.root2 = makeTestTempDir(TESTDIR)
2129 self.tmpConfigFile = self.root2
2130 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
2132 def testConfigExistence(self) -> None:
2133 # Append the yaml file name, else the Config constructor does not
2134 # know the file type.
2135 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
2136 super().testConfigExistence()
2139class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
2140 """Test that a config file created by makeRepo outside of repo works."""
2142 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
2144 def setUp(self) -> None:
2145 self.root = makeTestTempDir(TESTDIR)
2146 self.root2 = makeTestTempDir(TESTDIR)
2148 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
2149 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
2152@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
2153class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
2154 """S3Datastore specialization of a butler; an S3 storage Datastore +
2155 a local in-memory SqlRegistry.
2156 """
2158 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
2159 fullConfigKey = None
2160 validationCanFail = True
2162 bucketName = "anybucketname"
2163 """Name of the Bucket that will be used in the tests. The name is read from
2164 the config file used with the tests during set-up.
2165 """
2167 root = "butlerRoot/"
2168 """Root repository directory expected to be used in case useTempRoot=False.
2169 Otherwise the root is set to a 20 characters long randomly generated string
2170 during set-up.
2171 """
2173 datastoreStr = [f"datastore={root}"]
2174 """Contains all expected root locations in a format expected to be
2175 returned by Butler stringification.
2176 """
2178 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
2179 """The expected format of the S3 Datastore string."""
2181 registryStr = "/gen3.sqlite3"
2182 """Expected format of the Registry string."""
2184 mock_aws = mock_aws()
2185 """The mocked s3 interface from moto."""
2187 def genRoot(self) -> str:
2188 """Return a random string of len 20 to serve as a root
2189 name for the temporary bucket repo.
2191 This is the S3 analogue of tempfile.mkdtemp: its result is what
2192 self.root becomes when useTempRoot is True.
2193 """
2194 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
2195 return rndstr + "/"
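# A shorter hypothetical equivalent, if a uuid-based name were
# acceptable here:
#     import uuid
#     return uuid.uuid4().hex[:20] + "/"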
2197 def setUp(self) -> None:
2198 config = Config(self.configFile)
2199 uri = ResourcePath(config[".datastore.datastore.root"])
2200 self.bucketName = uri.netloc
2202 # Enable S3 mocking of tests.
2203 self.enterContext(clean_test_environment_for_s3())
2204 self.mock_aws.start()
2206 if self.useTempRoot:
2207 self.root = self.genRoot()
2208 rooturi = f"s3://{self.bucketName}/{self.root}"
2209 config.update({"datastore": {"datastore": {"root": rooturi}}})
2211 # Need a local folder to store the registry database.
2212 self.reg_dir = makeTestTempDir(TESTDIR)
2213 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
2215 # Moto needs to know that we expect the bucket to exist (its name
2216 # used to come from the class attribute bucketName).
2217 s3 = boto3.resource("s3")
2218 s3.create_bucket(Bucket=self.bucketName)
2220 self.datastoreStr = [f"datastore='{rooturi}'"]
2221 self.datastoreName = [f"FileDatastore@{rooturi}"]
2222 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
2223 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
2225 def tearDown(self) -> None:
2226 s3 = boto3.resource("s3")
2227 bucket = s3.Bucket(self.bucketName)
2228 try:
2229 bucket.objects.all().delete()
2230 except botocore.exceptions.ClientError as e:
2231 if e.response["Error"]["Code"] == "404":
2232 # the key was not reachable - pass
2233 pass
2234 else:
2235 raise
2237 bucket = s3.Bucket(self.bucketName)
2238 bucket.delete()
2240 # Stop the S3 mock.
2241 self.mock_aws.stop()
2243 if self.reg_dir is not None and os.path.exists(self.reg_dir):
2244 shutil.rmtree(self.reg_dir, ignore_errors=True)
2246 if self.useTempRoot and os.path.exists(self.root):
2247 shutil.rmtree(self.root, ignore_errors=True)
2249 super().tearDown()
2252class PosixDatastoreTransfers(unittest.TestCase):
2253 """Test data transfers between butlers.
2255 Tests cover different dataset-ID managers. UUID to UUID and integer to
2256 integer are tested. UUID to integer is not supported since we do not
2257 currently want to allow that. Integer to UUID is supported, with the
2258 caveat that a UUID4 will be generated, which is incorrect for raw
2259 dataset types; the test ignores that.
2260 """
2262 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
2263 storageClassFactory: StorageClassFactory
2265 @classmethod
2266 def setUpClass(cls) -> None:
2267 cls.storageClassFactory = StorageClassFactory()
2268 cls.storageClassFactory.addFromConfig(cls.configFile)
2270 def setUp(self) -> None:
2271 self.root = makeTestTempDir(TESTDIR)
2272 self.config = Config(self.configFile)
2274 def tearDown(self) -> None:
2275 removeTestTempDir(self.root)
2277 def create_butler(self, manager: str, label: str) -> Butler:
2278 config = Config(self.configFile)
2279 config["registry", "managers", "datasets"] = manager
2280 return Butler.from_config(
2281 Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True
2282 )
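# The config override above corresponds to a butler.yaml fragment like
# this (a sketch; the manager class name is the one passed in from
# create_butlers):
#   registry:
#     managers:
#       datasets: lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID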
2284 def create_butlers(self, manager1: str | None = None, manager2: str | None = None) -> None:
2285 default = "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID"
2286 if manager1 is None:
2287 manager1 = default
2288 if manager2 is None:
2289 manager2 = default
2290 self.source_butler = self.create_butler(manager1, "1")
2291 self.target_butler = self.create_butler(manager2, "2")
2293 def testTransferUuidToUuid(self) -> None:
2294 self.create_butlers()
2295 self.assertButlerTransfers()
2297 def testTransferMissing(self) -> None:
2298 """Test transfers where datastore records are missing.
2300 This is how execution butler works.
2301 """
2302 self.create_butlers()
2304 # Configure the source butler to allow trust.
2305 self.source_butler._datastore._set_trust_mode(True)
2307 self.assertButlerTransfers(purge=True)
2309 def testTransferMissingDisassembly(self) -> None:
2310 """Test transfers where datastore records are missing.
2312 This is how execution butler works.
2313 """
2314 self.create_butlers()
2316 # Configure the source butler to allow trust.
2317 self.source_butler._datastore._set_trust_mode(True)
2319 # Test disassembly.
2320 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
2322 def testAbsoluteURITransferDirect(self) -> None:
2323 """Test transfer using an absolute URI."""
2324 self._absolute_transfer("auto")
2326 def testAbsoluteURITransferCopy(self) -> None:
2327 """Test transfer using an absolute URI."""
2328 self._absolute_transfer("copy")
2330 def _absolute_transfer(self, transfer: str) -> None:
2331 self.create_butlers()
2333 storageClassName = "StructuredData"
2334 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
2335 datasetTypeName = "random_data"
2336 run = "run1"
2337 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
2339 dimensions = self.source_butler.dimensions.conform(())
2340 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
2341 self.source_butler.registry.registerDatasetType(datasetType)
2343 metrics = makeExampleMetrics()
2344 with ResourcePath.temporary_uri(suffix=".json") as temp:
2345 dataId = DataCoordinate.make_empty(self.source_butler.dimensions)
2346 source_refs = [DatasetRef(datasetType, dataId, run=run)]
2347 temp.write(json.dumps(metrics.exportAsDict()).encode())
2348 dataset = FileDataset(path=temp, refs=source_refs)
2349 self.source_butler.ingest(dataset, transfer="direct")
2351 self.target_butler.transfer_from(
2352 self.source_butler, dataset.refs, register_dataset_types=True, transfer=transfer
2353 )
2355 uri = self.target_butler.getURI(dataset.refs[0])
2356 if transfer == "auto":
2357 self.assertEqual(uri, temp)
2358 else:
2359 self.assertNotEqual(uri, temp)
2361 def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "StructuredData") -> None:
2362 """Test that a run can be transferred to another butler."""
2363 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
2364 datasetTypeName = "random_data"
2366 # The test will create 3 collections and we will want to transfer
2367 # two of those three.
2368 runs = ["run1", "run2", "other"]
2370 # Also want to use two different dataset types to ensure that
2371 # grouping works.
2372 datasetTypeNames = ["random_data", "random_data_2"]
2374 # Create the run collections in the source butler.
2375 for run in runs:
2376 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
2378 # Create dimensions in source butler.
2379 n_exposures = 30
2380 self.source_butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
2381 self.source_butler.registry.insertDimensionData(
2382 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
2383 )
2384 self.source_butler.registry.insertDimensionData(
2385 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
2386 )
2387 self.source_butler.registry.insertDimensionData(
2388 "day_obs",
2389 {
2390 "instrument": "DummyCamComp",
2391 "id": 20250101,
2392 },
2393 )
2395 for i in range(n_exposures):
2396 self.source_butler.registry.insertDimensionData(
2397 "group", {"instrument": "DummyCamComp", "name": f"group{i}"}
2398 )
2399 self.source_butler.registry.insertDimensionData(
2400 "exposure",
2401 {
2402 "instrument": "DummyCamComp",
2403 "id": i,
2404 "obs_id": f"exp{i}",
2405 "physical_filter": "d-r",
2406 "group": f"group{i}",
2407 "day_obs": 20250101,
2408 },
2409 )
2411 # Create dataset types in the source butler.
2412 dimensions = self.source_butler.dimensions.conform(["instrument", "exposure"])
2413 for datasetTypeName in datasetTypeNames:
2414 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
2415 self.source_butler.registry.registerDatasetType(datasetType)
2417 # Write a dataset to an unrelated run -- this will ensure that
2418 # we are rewriting integer dataset ids in the target if necessary.
2419 # This is not relevant for UUIDs.
2420 run = "distraction"
2421 butler = Butler.from_config(butler=self.source_butler, run=run)
2422 butler.put(
2423 makeExampleMetrics(),
2424 datasetTypeName,
2425 exposure=1,
2426 instrument="DummyCamComp",
2427 physical_filter="d-r",
2428 )
2430 # Write some example metrics to the source
2431 butler = Butler.from_config(butler=self.source_butler)
2433 # Set of DatasetRefs that should be in the list of refs to transfer
2434 # but which will not be transferred.
2435 deleted: set[DatasetRef] = set()
2437 n_expected = 20 # Number of datasets expected to be transferred
2438 source_refs = []
2439 for i in range(n_exposures):
2440 # Put a third of datasets into each collection, only retain
2441 # two thirds.
2442 index = i % 3
2443 run = runs[index]
2444 datasetTypeName = datasetTypeNames[i % 2]
2446 metric = MetricsExample(
2447 summary={"counter": i}, output={"text": "metric"}, data=[2 * x for x in range(i)]
2448 )
2449 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
2450 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
2452 # Remove the datastore record using low-level API, but only
2453 # for a specific index.
2454 if purge and index == 1:
2455 # For one of these delete the file as well.
2456 # This allows the "missing" code to filter the
2457 # file out.
2458 # Access the individual datastores.
2459 datastores = []
2460 if hasattr(butler._datastore, "datastores"):
2461 datastores.extend(butler._datastore.datastores)
2462 else:
2463 datastores.append(butler._datastore)
2465 if not deleted:
2466 # For a chained datastore we need to remove
2467 # files in each chain.
2468 for datastore in datastores:
2469 # The file might not be known to the datastore
2470 # if constraints are used.
2471 try:
2472 primary, uris = datastore.getURIs(ref)
2473 except FileNotFoundError:
2474 continue
2475 if primary and primary.scheme != "mem":
2476 primary.remove()
2477 for uri in uris.values():
2478 if uri.scheme != "mem":
2479 uri.remove()
2480 n_expected -= 1
2481 deleted.add(ref)
2483 # Remove the datastore record.
2484 for datastore in datastores:
2485 if hasattr(datastore, "removeStoredItemInfo"):
2486 datastore.removeStoredItemInfo(ref)
2488 if index < 2:
2489 source_refs.append(ref)
2490 if ref not in deleted:
2491 new_metric = butler.get(ref)
2492 self.assertEqual(new_metric, metric)
2494 # Create some bad dataset types to ensure we check for inconsistent
2495 # definitions.
2496 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
2497 for datasetTypeName in datasetTypeNames:
2498 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
2499 self.target_butler.registry.registerDatasetType(datasetType)
2500 with self.assertRaises(ConflictingDefinitionError) as cm:
2501 self.target_butler.transfer_from(self.source_butler, source_refs)
2502 self.assertIn("dataset type differs", str(cm.exception))
2504 # And remove the bad definitions.
2505 for datasetTypeName in datasetTypeNames:
2506 self.target_butler.registry.removeDatasetType(datasetTypeName)
2508 # Transfer without creating dataset types should fail.
2509 with self.assertRaises(KeyError):
2510 self.target_butler.transfer_from(self.source_butler, source_refs)
2512 # Transfer without creating dimensions should fail.
2513 with self.assertRaises(ConflictingDefinitionError) as cm:
2514 self.target_butler.transfer_from(self.source_butler, source_refs, register_dataset_types=True)
2515 self.assertIn("dimension", str(cm.exception))
2517 # The failed transfer above leaves the registry in an inconsistent
2518 # state because the run is created but then rolled back without
2519 # the collection cache being cleared. For now force a refresh.
2520 # Can remove with DM-35498.
2521 self.target_butler.registry.refresh()
2523 # Do a dry run -- this should not have any effect on the target butler.
2524 self.target_butler.transfer_from(self.source_butler, source_refs, dry_run=True)
2526 # Transfer the records for one ref to test the alternative API.
2527 with self.assertLogs(logger="lsst", level=logging.DEBUG) as log_cm:
2528 self.target_butler.transfer_dimension_records_from(self.source_butler, [source_refs[0]])
2529 self.assertIn("number of records transferred: 1", ";".join(log_cm.output))
2531 # Now transfer them to the second butler, including dimensions.
2532 with self.assertLogs(logger="lsst", level=logging.DEBUG) as log_cm:
2533 transferred = self.target_butler.transfer_from(
2534 self.source_butler,
2535 source_refs,
2536 register_dataset_types=True,
2537 transfer_dimensions=True,
2538 )
2539 self.assertEqual(len(transferred), n_expected)
2540 log_output = ";".join(log_cm.output)
2542 # A ChainedDatastore will use the in-memory datastore for mexists
2543 # so we cannot rely on the mexists log message.
2544 self.assertIn("Number of datastore records found in source", log_output)
2545 self.assertIn("Creating output run", log_output)
2547 # Do the transfer twice to ensure that it will do nothing extra.
2548 # Only do this if purge=True because it does not work for int
2549 # dataset_id.
2550 if purge:
2551 # This should not need to register dataset types.
2552 transferred = self.target_butler.transfer_from(self.source_butler, source_refs)
2553 self.assertEqual(len(transferred), n_expected)
2555 # Also do an explicit low-level transfer to trigger some
2556 # edge cases.
2557 with self.assertLogs(level=logging.DEBUG) as log_cm:
2558 self.target_butler._datastore.transfer_from(self.source_butler._datastore, source_refs)
2559 log_output = ";".join(log_cm.output)
2560 self.assertIn("no file artifacts exist", log_output)
2562 with self.assertRaises((TypeError, AttributeError)):
2563 self.target_butler._datastore.transfer_from(self.source_butler, source_refs) # type: ignore
2565 with self.assertRaises(ValueError):
2566 self.target_butler._datastore.transfer_from(
2567 self.source_butler._datastore, source_refs, transfer="split"
2568 )
2570 # Now try to get the same refs from the new butler.
2571 for ref in source_refs:
2572 if ref not in deleted:
2573 new_metric = self.target_butler.get(ref)
2574 old_metric = self.source_butler.get(ref)
2575 self.assertEqual(new_metric, old_metric)
2577 # Now prune run2 collection and create instead a CHAINED collection.
2578 # This should block the transfer.
2579 self.target_butler.removeRuns(["run2"], unstore=True)
2580 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2581 with self.assertRaises(CollectionTypeError):
2582 # Re-importing the run1 datasets can be problematic if they
2583 # use integer IDs so filter those out.
2584 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2585 self.target_butler.transfer_from(self.source_butler, to_transfer)
2588class ChainedDatastoreTransfers(PosixDatastoreTransfers):
2589 """Test transfers using a chained datastore."""
2591 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
2594class NullDatastoreTestCase(unittest.TestCase):
2595 """Test that we can fall back to a null datastore."""
2597 # Need a good config to create the repo.
2598 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
2599 storageClassFactory: StorageClassFactory
2601 @classmethod
2602 def setUpClass(cls) -> None:
2603 cls.storageClassFactory = StorageClassFactory()
2604 cls.storageClassFactory.addFromConfig(cls.configFile)
2606 def setUp(self) -> None:
2607 """Create a new butler root for each test."""
2608 self.root = makeTestTempDir(TESTDIR)
2609 Butler.makeRepo(self.root, config=Config(self.configFile))
2611 def tearDown(self) -> None:
2612 removeTestTempDir(self.root)
2614 def test_fallback(self) -> None:
2615 # Read the butler config and mess with the datastore section.
2616 config_path = os.path.join(self.root, "butler.yaml")
2617 bad_config = Config(config_path)
2618 bad_config["datastore", "cls"] = "lsst.not.a.datastore.Datastore"
2619 bad_config.dumpToUri(config_path)
2621 with self.assertRaises(RuntimeError):
2622 Butler(self.root, without_datastore=False)
2624 with self.assertRaises(RuntimeError):
2625 Butler.from_config(self.root, without_datastore=False)
2627 butler = Butler.from_config(self.root, writeable=True, without_datastore=True)
2628 self.assertIsInstance(butler._datastore, NullDatastore)
2630 # Check that registry is working.
2631 butler.registry.registerRun("MYRUN")
2632 collections = butler.registry.queryCollections(...)
2633 self.assertIn("MYRUN", set(collections))
2635 # Create a ref.
2636 dimensions = butler.dimensions.conform([])
2637 storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
2638 datasetTypeName = "metric"
2639 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
2640 butler.registry.registerDatasetType(datasetType)
2641 ref = DatasetRef(datasetType, {}, run="MYRUN")
2643 # Check that datastore will complain.
2644 with self.assertRaises(FileNotFoundError):
2645 butler.get(ref)
2646 with self.assertRaises(FileNotFoundError):
2647 butler.getURI(ref)
2650@unittest.skipIf(create_test_server is None, "Server dependencies not installed.")
2651class ButlerServerTests(FileDatastoreButlerTests, unittest.TestCase):
2652 """Test RemoteButler and Butler server."""
2654 configFile = None
2655 predictionSupported = False
2656 trustModeSupported = False
2658 def setUp(self):
2659 self.server_instance = self.enterContext(create_test_server(TESTDIR))
2661 def tearDown(self):
2662 pass
2664 def are_uris_equivalent(self, uri1: ResourcePath, uri2: ResourcePath) -> bool:
2665 # S3 pre-signed URLs may end up with differing expiration times in the
2666 # query parameters, so ignore query parameters when comparing.
2667 return uri1.scheme == uri2.scheme and uri1.netloc == uri2.netloc and uri1.path == uri2.path
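# E.g. (hypothetical values): these two URIs would compare as equivalent
# because only the signing query parameters differ:
#   https://bucket.s3.amazonaws.com/repo/file.fits?X-Amz-Expires=3600
#   https://bucket.s3.amazonaws.com/repo/file.fits?X-Amz-Expires=7200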
2669 def create_empty_butler(self, run: str | None = None, writeable: bool | None = None) -> Butler:
2670 return self.server_instance.hybrid_butler._clone(run=run)
2672 def remove_dataset_out_of_band(self, butler: Butler, ref: DatasetRef) -> None:
2673 # Can't delete a file via S3 signed URLs, so we need to reach in
2674 # through DirectButler to delete the dataset.
2675 uri = self.server_instance.direct_butler.getURI(ref)
2676 uri.remove()
2678 def testConstructor(self):
2679 # RemoteButler constructor is tested in test_server.py and
2680 # test_remote_butler.py.
2681 pass
2683 def testDafButlerRepositories(self):
2684 # Loading of RemoteButler via repository index is tested in
2685 # test_server.py.
2686 pass
2688 def testGetDatasetTypes(self) -> None:
2689 # This is mostly a test of validateConfiguration, which is for
2690 # validating Datastore configuration and thus isn't relevant to
2691 # RemoteButler.
2692 pass
2694 def testMakeRepo(self) -> None:
2695 # Only applies to DirectButler.
2696 pass
2698 # Pickling not yet implemented for RemoteButler/HybridButler.
2699 @unittest.expectedFailure
2700 def testPickle(self) -> None:
2701 return super().testPickle()
2703 def testStringification(self) -> None:
2704 self.assertEqual(
2705 str(self.server_instance.remote_butler),
2706 "RemoteButler(https://test.example/api/butler/repo/testrepo)",
2707 )
2709 def testTransaction(self) -> None:
2710 # Transactions will never be supported for RemoteButler.
2711 pass
2713 def testPutTemplates(self) -> None:
2714 # The Butler server instance is configured with different file naming
2715 # templates than this test is expecting.
2716 pass
2719def setup_module(module: types.ModuleType) -> None:
2720 """Set up the module for pytest."""
2721 clean_environment()
2724if __name__ == "__main__":
2725 clean_environment()
2726 unittest.main()