# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Tests for Butler.
29"""
30from __future__ import annotations
32import gc
33import json
34import logging
35import os
36import pathlib
37import pickle
38import posixpath
39import random
40import shutil
41import string
42import tempfile
43import unittest
44import uuid
45from collections.abc import Mapping
46from typing import TYPE_CHECKING, Any, cast

try:
    import boto3
    import botocore
    from lsst.resources.s3utils import clean_test_environment_for_s3

    try:
        from moto import mock_aws  # v5
    except ImportError:
        from moto import mock_s3 as mock_aws
except ImportError:
    boto3 = None

    def mock_aws(*args: Any, **kwargs: Any) -> Any:  # type: ignore[no-untyped-def]
        """No-op decorator in case moto mock_aws can not be imported."""
        # Return the decorated object unchanged (or a pass-through decorator
        # when called with arguments) so that applying @mock_aws remains
        # harmless even though the mocked tests will be skipped.
        if args and callable(args[0]):
            return args[0]
        return lambda obj: obj
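
# The try/except blocks above and below implement an optional-dependency
# guard: when an import fails, a sentinel (``boto3 = None``) or a no-op
# stand-in is bound instead, so this module can always be imported and the
# dependent tests can be skipped. A minimal sketch of the pattern, using a
# hypothetical optional module ``widget`` purely for illustration:
#
#     try:
#         import widget
#     except ImportError:
#         widget = None
#
#     @unittest.skipIf(widget is None, "widget is not installed")
#     class WidgetTestCase(unittest.TestCase):
#         ...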

try:
    from lsst.daf.butler.tests.server import create_test_server
except ImportError:
    create_test_server = None

try:
    # It's possible but silly to have testing.postgresql installed without
    # having the postgresql server installed (because then nothing in
    # testing.postgresql would work), so we use the presence of that module
    # to test whether we can expect the server to be available.
    import testing.postgresql  # type: ignore[import]
except ImportError:
    testing = None

import astropy.time
import sqlalchemy
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    ButlerRepoIndex,
    CollectionCycleError,
    CollectionType,
    Config,
    DataCoordinate,
    DatasetExistence,
    DatasetNotFoundError,
    DatasetRef,
    DatasetType,
    FileDataset,
    NoDefaultCollectionError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.datastore import NullDatastore
from lsst.daf.butler.datastore.file_templates import FileTemplate, FileTemplateValidationError
from lsst.daf.butler.datastores.fileDatastore import FileDatastore
from lsst.daf.butler.direct_butler import DirectButler
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
    OrphanedRecordError,
)
from lsst.daf.butler.registry.sql_registry import SqlRegistry
from lsst.daf.butler.repo_relocation import BUTLER_ROOT_TAG
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.introspection import get_full_type_name

if TYPE_CHECKING:
    import types

    from lsst.daf.butler import DimensionGroup, Registry, StorageClass

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def clean_environment() -> None:
    """Remove external environment variables that affect the tests."""
    for k in ("DAF_BUTLER_REPOSITORY_INDEX",):
        os.environ.pop(k, None)


def makeExampleMetrics() -> MetricsExample:
    """Return an example dataset suitable for tests."""
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )
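

# Note: ``MetricsExample`` (from lsst.daf.butler.tests) exposes the three
# attributes exercised throughout this file: ``summary`` (the first mapping
# above), ``output`` (the second), and ``data`` (the list). The component
# tests below rely on this when comparing each retrieved component against
# ``getattr(reference, component)``.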


class TransactionTestError(Exception):
    """Specific error for testing transactions, used to prevent the
    misdiagnosis that might otherwise occur when a standard exception is
    raised.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests of ButlerConfig behavior that are not covered by any
    other test cases.
    """

    def testSearchPath(self) -> None:
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")
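
    # ``searchPaths`` prepends extra directories to the configuration search
    # path, so values found there (here the ``override_record`` table name
    # from the ``testConfigs`` directory) take precedence over the defaults
    # used when building ``config1``.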


class ButlerPutGetTests(TestCaseMixin):
    """Helper methods for running a suite of put/get tests from different
    butler configurations.
    """

    root: str
    default_run = "ingésτ😺"
    storageClassFactory: StorageClassFactory
    configFile: str | None
    tmpConfigFile: str

    @staticmethod
    def addDatasetType(
        datasetTypeName: str, dimensions: DimensionGroup, storageClass: StorageClass | str, registry: Registry
    ) -> DatasetType:
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls) -> None:
        cls.storageClassFactory = StorageClassFactory()
        if cls.configFile is not None:
            cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(
        self,
        butler: Butler,
        datasetRef: DatasetRef,
        components: tuple[str, ...],
        reference: Any,
        collections: Any = None,
    ) -> None:
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self) -> None:
        if self.root is not None:
            removeTestTempDir(self.root)

    def create_empty_butler(self, run: str | None = None, writeable: bool | None = None) -> Butler:
        """Create a Butler for the test repository, without inserting any
        test data.
        """
        butler = Butler.from_config(self.tmpConfigFile, run=run, writeable=writeable)
        assert isinstance(butler, DirectButler), "Expect DirectButler in configuration"
        return butler

    def create_butler(
        self, run: str, storageClass: StorageClass | str, datasetTypeName: str
    ) -> tuple[Butler, DatasetType]:
        """Create a Butler for the test repository and insert some test data
        into it.
        """
        butler = self.create_empty_butler(run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {run})

        # Create and register a DatasetType
        dimensions = butler.dimensions.conform(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        butler.registry.insertDimensionData("day_obs", {"instrument": "DummyCamComp", "id": 20200101})
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
                "day_obs": 20200101,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "day_obs": 20200101,
                },
            )
        return butler, datasetType
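
    # The insertion order above matters: ``instrument`` records must exist
    # before the ``physical_filter``, ``visit_system`` and ``day_obs`` records
    # that reference them, which in turn must exist before ``visit``. Tests
    # below that build their own repositories repeat the same bottom-up
    # sequence.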

    def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Butler:
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = self.default_run
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)
        assert butler.run is not None

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = butler.registry.expandDataId({"instrument": "DummyCamComp", "visit": 423})

        # Dataset should not exist if we haven't added it
        with self.assertRaises(DatasetNotFoundError):
            butler.get(datasetTypeName, dataId)

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        ref = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run="put_run_1")
        args: tuple[DatasetRef] | tuple[str | DatasetType, DataCoordinate]
        for args in ((ref,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                kwargs: dict[str, Any] = {}
                if not isinstance(args[0], DatasetRef):  # type: ignore
                    kwargs["run"] = this_run
                ref = butler.put(metric, *args, **kwargs)
                self.assertIsInstance(ref, DatasetRef)

                # Test get of a ref.
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get by dataset type name and dataId.
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a DatasetRef again.
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId.
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a ref.
                metricOut = butler.getDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

                primary_uri, secondary_uris = butler.getURIs(ref)
                n_uris = len(secondary_uris)
                if primary_uri:
                    n_uris += 1

                # Can the artifacts themselves be retrieved?
                if not butler._datastore.isEphemeral:
                    # Create a temporary directory to hold the retrieved
                    # artifacts.
                    with tempfile.TemporaryDirectory(
                        prefix="butler-artifacts-", ignore_cleanup_errors=True
                    ) as artifact_root:
                        root_uri = ResourcePath(artifact_root, forceDirectory=True)

                        for preserve_path in (True, False):
                            destination = root_uri.join(f"{preserve_path}_{counter}/")
                            log = logging.getLogger("lsst.x")
                            log.warning("Using destination %s for args %s", destination, args)
                            # Use copy so that we can test that overwrite
                            # protection works (using "auto" for File URIs
                            # would use hard links and subsequent transfer
                            # would work because it knows they are the same
                            # file).
                            transferred = butler.retrieveArtifacts(
                                [ref], destination, preserve_path=preserve_path, transfer="copy"
                            )
                            self.assertGreater(len(transferred), 0)
                            artifacts = list(ResourcePath.findFileResources([destination]))
                            self.assertEqual(set(transferred), set(artifacts))

                            for artifact in transferred:
                                path_in_destination = artifact.relative_to(destination)
                                self.assertIsNotNone(path_in_destination)
                                assert path_in_destination is not None

                                # When path is not preserved there should not
                                # be any path separators.
                                num_seps = path_in_destination.count("/")
                                if preserve_path:
                                    self.assertGreater(num_seps, 0)
                                else:
                                    self.assertEqual(num_seps, 0)

                            self.assertEqual(
                                len(artifacts),
                                n_uris,
                                "Comparing expected artifacts vs actual:"
                                f" {artifacts} vs {primary_uri} and {secondary_uris}",
                            )

                            if preserve_path:
                                # No need to run these twice
                                with self.assertRaises(ValueError):
                                    butler.retrieveArtifacts([ref], destination, transfer="move")

                                with self.assertRaisesRegex(
                                    ValueError, "^Destination location must refer to a directory"
                                ):
                                    butler.retrieveArtifacts(
                                        [ref], ResourcePath("/some/file.txt", forceDirectory=False)
                                    )

                            with self.assertRaises(FileExistsError):
                                butler.retrieveArtifacts([ref], destination)

                            transferred_again = butler.retrieveArtifacts(
                                [ref], destination, preserve_path=preserve_path, overwrite=True
                            )
                            self.assertEqual(set(transferred_again), set(transferred))

                # Now remove the dataset completely.
                butler.pruneDatasets([ref], purge=True, unstore=True)
                # Lookup with original args should still fail.
                kwargs = {"collections": this_run}
                if isinstance(args[0], DatasetRef):
                    kwargs = {}  # Prevent warning from being issued.
                self.assertFalse(butler.exists(*args, **kwargs))
                # get() should still fail.
                with self.assertRaises((FileNotFoundError, DatasetNotFoundError)):
                    butler.get(ref)
                # Registry shouldn't be able to find it by dataset_id anymore.
                self.assertIsNone(butler.get_dataset(ref.id))

                # Do explicit registry removal since we know the collections
                # are empty.
                butler.registry.removeCollection(this_run)
                expected_collections.remove(this_run)

        # Create DatasetRef for put using default run.
        refIn = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run=butler.run)

        # Check that getDeferred fails with a standalone ref.
        with self.assertRaises(LookupError):
            butler.getDeferred(refIn)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        assert metric.data is not None  # for mypy
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.find_dataset(compNameS, dataId, collections=butler.collections)
            assert compRef is not None
            summary = butler.get(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaisesRegex(
            ValueError,
            "(Supplied dataset type .* inconsistent with registry)"
            "|(The new storage class .* is not compatible with the existing storage class)",
        ):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaisesRegex(ValueError, "DatasetRef given, cannot use dataId as well"):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match.
        with self.assertRaises((FileNotFoundError, DatasetNotFoundError)):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=uuid.UUID(int=101), run=butler.run))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaisesRegex(KeyError, "Parameter 'unsupported' not understood"):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Add the same ref again, so we can check that a duplicate put fails.
        ref = butler.put(metric, datasetType, dataId)

        # Repeat put will fail.
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Repeat the same sequence with a resolved ref.
        butler.pruneDatasets([ref], unstore=True, purge=True)
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaisesRegex(ConflictingDefinitionError, "Datastore already contains dataset"):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # In the case of a resolved ref this write will succeed.
        ref = butler.put(metric, refIn)

        # Leave the dataset in place since some downstream tests require
        # something to be present.

        return butler
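
    # A concrete subclass drives this helper with a specific storage class,
    # e.g. (see testBasicPutGet below):
    #
    #     storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
    #     self.runPutGetTest(storageClass, "test_metric")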

    def testDeferredCollectionPassing(self) -> None:
        # Construct a butler with no run or collection, but make it writeable.
        butler = self.create_empty_butler(writeable=True)
        # Create and register a DatasetType
        dimensions = butler.dimensions.conform(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData("day_obs", {"instrument": "DummyCamComp", "id": 20250101})
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "day_obs": 20250101,
            },
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # The second time the run is already registered, so this is a no-op.
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.exists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is an error.
        with self.assertRaises(NoDefaultCollectionError):
            butler.exists(datasetType, dataId)
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Deleting the dataset from the new tagged collection should leave it
        # findable in the original run collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.exists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True
    validationCanFail: bool
    fullConfigKey: str | None
    registryStr: str | None
    datastoreName: list[str] | None
    datastoreStr: list[str]
    predictionSupported = True
    """Does getURIs support 'prediction mode'?"""

    def setUp(self) -> None:
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")
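
    # Each test thus gets a fresh repository: ``Butler.makeRepo`` writes the
    # repository configuration (including ``butler.yaml``) under a new
    # temporary root, and ``Butler.from_config`` later instantiates butlers
    # against that file. A minimal stand-alone sketch, assuming a writable
    # directory ``root`` and a butler seed configuration ``config_file``:
    #
    #     Butler.makeRepo(root, config=Config(config_file))
    #     butler = Butler.from_config(os.path.join(root, "butler.yaml"), run="some_run")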

    def are_uris_equivalent(self, uri1: ResourcePath, uri2: ResourcePath) -> bool:
        """Return True if two URIs refer to the same resource.

        Subclasses may override to handle unique requirements.
        """
        return uri1 == uri2

    def testConstructor(self) -> None:
        """Independent test of constructor."""
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler.from_config(config_dir, run=self.default_run)
            self.assertIsInstance(butler, Butler)

            # Even with a ResourcePath.
            butler = Butler.from_config(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {self.default_run})

        # Check that some special characters can be included in the run name.
        special_run = "u@b.c-A"
        butler_special = Butler.from_config(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler.from_config(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, ("other",))
        self.assertIsNone(butler2.run)
        self.assertEqual(type(butler._datastore), type(butler2._datastore))
        self.assertEqual(butler._datastore.config, butler2._datastore.config)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"file://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), {"label", "bad_label"})
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler.from_config(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler.from_config("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler.from_config("not_there", writeable=False)
                    with self.assertRaisesRegex(FileNotFoundError, "resolved from alias 'bad_label'"):
                        Butler.from_config("bad_label")
                    with self.assertRaises(FileNotFoundError):
                        # Should ignore aliases.
                        Butler.from_config(ResourcePath("label", forceAbsolute=False))
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertEqual(
                        Butler.get_repo_uri("missing", True), ResourcePath("missing", forceAbsolute=False)
                    )
                    self.assertIn("not known to", str(cm.exception))
                    # Should report no failure.
                    self.assertEqual(ButlerRepoIndex.get_failure_reason(), "")
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                # Now with empty configuration.
                butler_index = Config()
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    with self.assertRaisesRegex(FileNotFoundError, "(no known aliases)"):
                        Butler.from_config("label")
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                # Now with bad contents.
                with open(temp_file.ospath, "w") as fh:
                    print("'", file=fh)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    with self.assertRaisesRegex(FileNotFoundError, "(no known aliases:.*could not be read)"):
                        Butler.from_config("label")
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())

            with self.assertRaisesRegex(FileNotFoundError, "index file not found"):
                Butler.from_config("label")

        # Check that we can create a Butler when the alias file is not found.
        butler = Butler.from_config(self.tmpConfigFile, writeable=False)
        self.assertIsInstance(butler, Butler)
        with self.assertRaises(RuntimeError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertEqual(Butler.get_repo_uri("label", True), ResourcePath("label", forceAbsolute=False))
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases.*No repository index"):
            # No aliases registered.
            Butler.from_config("not_there")
        self.assertEqual(Butler.get_known_repos(), set())

    def testDafButlerRepositories(self) -> None:
        with unittest.mock.patch.dict(
            os.environ,
            {"DAF_BUTLER_REPOSITORIES": "label: 'https://someuri.com'\notherLabel: 'https://otheruri.com'\n"},
        ):
            self.assertEqual(str(Butler.get_repo_uri("label")), "https://someuri.com")

        with unittest.mock.patch.dict(
            os.environ,
            {
                "DAF_BUTLER_REPOSITORIES": "label: https://someuri.com",
                "DAF_BUTLER_REPOSITORY_INDEX": "https://someuri.com",
            },
        ):
            with self.assertRaisesRegex(RuntimeError, "Only one of the environment variables"):
                Butler.get_repo_uri("label")

        with unittest.mock.patch.dict(
            os.environ,
            {"DAF_BUTLER_REPOSITORIES": "invalid"},
        ):
            with self.assertRaisesRegex(ValueError, "Repository index not in expected format"):
                Butler.get_repo_uri("label")

    def testBasicPutGet(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        if self.predictionSupported:
            dataId = {"instrument": "DummyCamComp", "visit": 424}
            uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
            self.assertFalse(components)
            self.assertIsInstance(uri, ResourcePath)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler._datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        if self.predictionSupported:
            # Predicted dataset
            dataId = {"instrument": "DummyCamComp", "visit": 424}
            uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

            if butler._datastore.isEphemeral:
                # Never disassembled
                self.assertIsInstance(uri, ResourcePath)
                self.assertFalse(components)
                self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
                self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
            else:
                self.assertIsNone(uri)
                self.assertEqual(set(components), set(storageClass.components))
                for compuri in components.values():
                    self.assertIsInstance(compuri, ResourcePath)
                    self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                    self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testStorageClassOverrideGet(self) -> None:
        """Test storage class conversion on get with override."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        datasetTypeName = "anything"
        run = self.default_run

        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset.
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        ref = butler.put(metric, datasetType, dataId)

        # Return native type.
        retrieved = butler.get(ref)
        self.assertEqual(retrieved, metric)

        # Specify an override.
        new_sc = self.storageClassFactory.getStorageClass("MetricsConversion")
        model = butler.get(ref, storageClass=new_sc)
        self.assertNotEqual(type(model), type(retrieved))
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override later.
        deferred = butler.getDeferred(ref)
        model = deferred.get(storageClass=new_sc)
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override up front.
        deferred = butler.getDeferred(ref, storageClass=new_sc)
        model = deferred.get()
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Retrieve a component. Should be a tuple.
        data = butler.get("anything.data", dataId, storageClass="StructuredDataDataTestTuple")
        self.assertIs(type(data), tuple)
        self.assertEqual(data, tuple(retrieved.data))

        # A parameter on the write storage class should work regardless
        # of the read storage class.
        data = butler.get(
            "anything.data",
            dataId,
            storageClass="StructuredDataDataTestTuple",
            parameters={"slice": slice(2, 4)},
        )
        self.assertEqual(len(data), 2)

        # Try a parameter that is known to the read storage class but not
        # the write storage class.
        with self.assertRaises(KeyError):
            butler.get(
                "anything.data",
                dataId,
                storageClass="StructuredDataDataTestTuple",
                parameters={"xslice": slice(2, 4)},
            )

    def testPytypePutCoercion(self) -> None:
        """Test python type coercion on Butler.get and put."""
        # Store some data with the normal example storage class.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        datasetTypeName = "test_metric"
        butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)

        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Put a dict; it should coerce to a MetricsExample.
        test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
        metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
        test_metric = butler.get(metric_ref)
        self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
        self.assertEqual(test_metric.summary, test_dict["summary"])
        self.assertEqual(test_metric.output, test_dict["output"])

        # Check that the put still works if a DatasetType is given with
        # a definition matching this python type.
        registry_type = butler.get_dataset_type(datasetTypeName)
        this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
        metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
        self.assertEqual(metric2_ref.datasetType, registry_type)

        # The get will return the type expected by registry.
        test_metric2 = butler.get(metric2_ref)
        self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")

        # Make a new DatasetRef with the compatible but different DatasetType.
        # This should now return a dict.
        new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
        test_dict2 = butler.get(new_ref)
        self.assertEqual(get_full_type_name(test_dict2), "dict")

        # Get it again with the wrong dataset type definition, this time
        # passing the DatasetType rather than a ref. This should be
        # consistent with the ref-based get() behavior above and return the
        # python type of the given DatasetType.
        test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
        self.assertEqual(get_full_type_name(test_dict3), "dict")
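
    # Net effect exercised above: on put, the in-memory object is coerced to
    # the storage class of the dataset type actually used for the call; on
    # get, the returned python type follows the dataset type supplied by the
    # caller (here a dict for ``this_type`` and a MetricsExample for the
    # registry-resolved refs).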

    def testIngest(self) -> None:
        butler = self.create_empty_butler(run=self.default_run)

        # Create and register a DatasetType
        dimensions = butler.dimensions.conform(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData("day_obs", {"instrument": "DummyCamComp", "id": 20250101})
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "day_obs": 20250101,
            },
            {
                "instrument": "DummyCamComp",
                "id": 424,
                "name": "fourtwentyfour",
                "physical_filter": "d-r",
                "day_obs": 20250101,
            },
        )

        formatter = doImportType("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, run=self.default_run)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")
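
        # ``ingest`` registers the pre-resolved DatasetRefs and transfers the
        # named files into the datastore; with ``transfer="copy"`` the
        # originals under ``dataRoot`` are left untouched, unlike the
        # ``transfer="move"`` case exercised further below.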

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertFalse(self.are_uris_equivalent(uri1, uri2), f"Cf. {uri1} with {uri2}")

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, run=self.default_run))

        # Test "move" transfer to ensure that the files themselves
        # have disappeared following ingest.
        with ResourcePath.temporary_uri(suffix=".yaml") as tempFile:
            tempFile.transfer_from(ResourcePath(metricFile), transfer="copy")

            datasets = []
            datasets.append(FileDataset(path=tempFile, refs=refs, formatter=MultiDetectorFormatter))

            # For the first ingest use copy.
            butler.ingest(*datasets, transfer="copy", record_validation_info=False)

            # Now try to ingest again in "execution butler" mode where
            # the registry entries exist but the datastore does not have
            # the files. We also need to strip the dimension records to ensure
            # that they will be re-added by the ingest.
            ref = datasets[0].refs[0]
            datasets[0].refs = [
                cast(
                    DatasetRef,
                    butler.find_dataset(ref.datasetType, data_id=ref.dataId, collections=ref.run),
                )
                for ref in datasets[0].refs
            ]
            all_refs = []
            for dataset in datasets:
                refs = []
                for ref in dataset.refs:
                    # Create a dict from the dataId to drop the records.
                    new_data_id = dict(ref.dataId.required)
                    new_ref = butler.find_dataset(ref.datasetType, new_data_id, collections=ref.run)
                    assert new_ref is not None
                    self.assertFalse(new_ref.dataId.hasRecords())
                    refs.append(new_ref)
                dataset.refs = refs
                all_refs.extend(dataset.refs)
            butler.pruneDatasets(all_refs, disassociate=False, unstore=True, purge=False)

            # Use move mode to test that the file is deleted. Also
            # disable recording of file size.
            butler.ingest(*datasets, transfer="move", record_validation_info=False)

            # Check that every ref now has records.
            for dataset in datasets:
                for ref in dataset.refs:
                    self.assertTrue(ref.dataId.hasRecords())

            # Ensure that the file has disappeared.
            self.assertFalse(tempFile.exists())

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler._datastore.getStoredItemsInfo(datasets[0].refs[0])  # type: ignore[attr-defined]
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertTrue(self.are_uris_equivalent(uri1, uri2), f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.exists(datasetTypeName, dataId1))
        self.assertTrue(butler.exists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

        # Ensure we can ingest 0 datasets.
        datasets = []
        butler.ingest(*datasets)

    def testPickle(self) -> None:
        """Test pickle support."""
        butler = self.create_empty_butler(run=self.default_run)
        assert isinstance(butler, DirectButler), "Expect DirectButler in configuration"
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self) -> None:
        butler = self.create_empty_butler(run=self.default_run)
        dimensions = butler.dimensions.conform(["instrument", "visit", "physical_filter"])
        dimensionEntries: list[tuple[str, list[Mapping[str, Any]]]] = [
            (
                "instrument",
                [
                    {"instrument": "DummyCam"},
                    {"instrument": "DummyHSC"},
                    {"instrument": "DummyCamComp"},
                ],
            ),
            ("physical_filter", [{"instrument": "DummyCam", "name": "d-r", "band": "R"}]),
            ("day_obs", [{"instrument": "DummyCam", "id": 20250101}]),
            (
                "visit",
                [
                    {
                        "instrument": "DummyCam",
                        "id": 42,
                        "name": "fortytwo",
                        "physical_filter": "d-r",
                        "day_obs": 20250101,
                    }
                ],
            ),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for element, data in dimensionEntries:
            butler.registry.insertDimensionData(element, *data)

        # When a DatasetType is added to the registry, entries are not
        # created for its components, but querying can still return the
        # components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry: set[DatasetType] = set()
        for parent_dataset_type in butler.registry.queryDatasetTypes():
            fromRegistry.add(parent_dataset_type)
            fromRegistry.update(parent_dataset_type.makeAllComponentDatasetTypes())
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self) -> None:
        butler = self.create_empty_butler(run=self.default_run)
        datasetTypeName = "test_metric"
        dimensions = butler.dimensions.conform(["instrument", "visit"])
        dimensionEntries: tuple[tuple[str, Mapping[str, Any]], ...] = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("day_obs", {"instrument": "DummyCam", "id": 20250101}),
            (
                "visit",
                {
                    "instrument": "DummyCam",
                    "id": 42,
                    "name": "fortytwo",
                    "physical_filter": "d-r",
                    "day_obs": 20250101,
                },
            ),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test get of a ref.
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get by dataset type name.
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for the missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly that the Dataset entry is missing
        self.assertIsNone(butler.find_dataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.get(ref)
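
    # Everything performed inside ``butler.transaction()`` above (the
    # dimension records, the registry entry for the dataset, and the
    # datastore artifact) is rolled back when TransactionTestError
    # propagates, which the four follow-up assertions verify from different
    # angles.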

    def testMakeRepo(self) -> None:
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler.from_config(butlerConfig)
        assert isinstance(butler1, DirectButler), "Expect DirectButler in configuration"
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler.from_config(butlerConfig)
        assert isinstance(butler2, DirectButler), "Expect DirectButler in configuration"
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with a relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler.from_config(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self) -> None:
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler._datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self) -> None:
        """Test that dataIds can be rewritten based on dimension records."""
        butler = self.create_empty_butler(run=self.default_run)

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.dimensions.conform(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        butler.registry.insertDimensionData("day_obs", {"instrument": "DummyCamComp", "id": dayobs})

        for i in range(n_exposures):
            butler.registry.insertDimensionData("group", {"instrument": "DummyCamComp", "name": f"group{i}"})
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                    "group": f"group{i}",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)

        # Check that we can find the datasets using the day_obs or the
        # exposure.day_obs.
        datasets_1 = list(
            butler.registry.queryDatasets(
                datasetType,
                collections=self.default_run,
                where="day_obs = dayObs AND instrument = instr",
                bind={"dayObs": dayobs, "instr": "DummyCamComp"},
            )
        )
        datasets_2 = list(
            butler.registry.queryDatasets(
                datasetType,
                collections=self.default_run,
                where="exposure.day_obs = dayObs AND instrument = instr",
                bind={"dayObs": dayobs, "instr": "DummyCamComp"},
            )
        )
        self.assertEqual(datasets_1, datasets_2)
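
    # Note that the put above never specifies ``exposure`` directly: the
    # butler expands ``seq_num`` plus ``day_obs`` into the matching exposure
    # record and rewrites the dataId accordingly, which is why
    # ``ref.dataId["exposure"]`` can be compared against the loop counter.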

    def testGetDatasetCollectionCaching(self) -> None:
        # Prior to DM-41117, there was a bug where get_dataset would throw
        # MissingCollectionError if you tried to fetch a dataset that was
        # added after the collection cache was last updated.
        reader_butler, datasetType = self.create_butler(self.default_run, "int", "datasettypename")
        writer_butler = self.create_empty_butler(writeable=True, run="new_run")
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        put_ref = writer_butler.put(123, datasetType, dataId)
        get_ref = reader_butler.get_dataset(put_ref.id)
        self.assertEqual(get_ref.id, put_ref.id)

    def testCollectionChainPrepend(self) -> None:
        butler = self.create_empty_butler(writeable=True)

        butler.registry.registerCollection("chain", CollectionType.CHAINED)

        runs = ["a", "b", "c", "d"]
        for run in runs:
            butler.registry.registerCollection(run)

        butler.registry.registerCollection("staticchain", CollectionType.CHAINED)
        butler.registry.setCollectionChain("staticchain", ["a", "b"])

        def check_chain(expected: list[str]) -> None:
            children = butler.registry.getCollectionChain("chain")
            self.assertEqual(expected, list(children))

        # Duplicates are removed from the list of children
        butler.prepend_collection_chain("chain", ["c", "b", "c"])
        check_chain(["c", "b"])

        # Prepend goes on the front of the existing chain
        butler.prepend_collection_chain("chain", ["a"])
        check_chain(["a", "c", "b"])

        # Empty prepend does nothing
        butler.prepend_collection_chain("chain", [])
        check_chain(["a", "c", "b"])

        # Prepending children that already exist in the chain removes them
        # from their current position.
        butler.prepend_collection_chain("chain", ["d", "b", "c"])
        check_chain(["d", "b", "c", "a"])

        # Missing parent collection
        with self.assertRaises(MissingCollectionError):
            butler.prepend_collection_chain("doesnotexist", [])
        # Missing child collection
        with self.assertRaises(MissingCollectionError):
            butler.prepend_collection_chain("chain", ["doesnotexist"])
        # Forbid operations on non-chained collections
        with self.assertRaises(CollectionTypeError):
            butler.prepend_collection_chain("d", ["a"])

        # Prevent collection cycles
        butler.registry.registerCollection("chain2", CollectionType.CHAINED)
        butler.prepend_collection_chain("chain2", "chain")
        with self.assertRaises(CollectionCycleError):
            butler.prepend_collection_chain("chain", "chain2")

        # Make sure none of those operations interfered with unrelated chains
        self.assertEqual(["a", "b"], list(butler.registry.getCollectionChain("staticchain")))

        with butler._caching_context():
            with self.assertRaisesRegex(RuntimeError, "Chained collection modification not permitted"):
                butler.prepend_collection_chain("chain", "a")


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    trustModeSupported = True

    def checkFileExists(self, root: str | ResourcePath, relpath: str | ResourcePath) -> bool:
        """Check if a file exists at the given path (relative to root).

        Test testPutTemplates verifies the actual physical existence of the
        files in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = self.create_empty_butler(run=self.default_run)

        # Add needed dimensions.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData("day_obs", {"instrument": "DummyCamComp", "id": 20250101})
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "v423",
                "physical_filter": "d-r",
                "day_obs": 20250101,
            },
        )
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 425,
                "name": "v425",
                "physical_filter": "d-r",
                "day_obs": 20250101,
            },
        )

        # Create and store a dataset.
        metric = makeExampleMetrics()
        # Create three almost-identical DatasetTypes (metric1 and metric2
        # will use the default template; metric3 is used below to test a
        # template that does not produce unique filenames).
        dimensions = butler.dimensions.conform(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed.
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(uri.exists())
        self.assertTrue(
            uri.unquoted_path.endswith(f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle")
        )

        # Check the template based on dimensions.
        if hasattr(butler._datastore, "templates"):
            butler._datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(uri.exists())
        self.assertTrue(
            uri.unquoted_path.endswith(f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle")
        )

        # Check the template based on dimensions.
        if hasattr(butler._datastore, "templates"):
            butler._datastore.templates.validateTemplates([ref])

        # Use a template that has a typo in dimension record metadata.
        # Easier to test with a butler that has a ref with records attached.
        template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits")
        with self.assertLogs("lsst.daf.butler.datastore.file_templates", "INFO"):
            path = template.format(ref)
        self.assertEqual(path, f"a/v423/{ref.id}_fits")
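        # (The ":?" modifier marks a template field as optional: the unknown
        # "namex" record attribute is dropped with a logged message above,
        # whereas the same typo in a required field below raises KeyError.)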

        template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits")
        with self.assertRaises(KeyError):
            with self.assertLogs("lsst.daf.butler.datastore.file_templates", "INFO"):
                template.format(ref)

        # Now use a file template that will not result in unique filenames.
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self) -> None:
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self) -> None:
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass: StorageClass) -> None:
        """Test exporting and importing.

        This test does an export to a temp directory and an import back
        into a new temp directory repo. It does not assume a posix datastore.
        """
        exportButler = self.runPutGetTest(storageClass, "test_metric")

        # Test that we must have a file extension.
        with self.assertRaises(ValueError):
            with exportButler.export(filename="dump", directory=".") as export:
                pass

        # Test that an unknown format is not allowed.
        with self.assertRaises(ValueError):
            with exportButler.export(filename="dump.fits", directory=".") as export:
                pass

        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements
                # even though there aren't any in these datasets or in the
                # database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler.
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public api.
                with open(exportFile) as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                    )
                importButler = Butler.from_config(importDir, run=self.default_run)
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.exists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.dimensions["skymap"].RecordClass(**skymapRecord)],
                )

    def testRemoveRuns(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = self.create_empty_butler(writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put a dataset in each.
        metric = makeExampleMetrics()
        dimensions = butler.dimensions.conform(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1)
        uri2 = butler.getURI(ref2)

        with self.assertRaises(OrphanedRecordError):
            butler.registry.removeDatasetType(datasetType.name)

        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.stored(ref1))
        self.assertFalse(butler.stored(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)

        with self.assertLogs("lsst.daf.butler.registry", "INFO") as cm:
            butler.registry.removeDatasetType(("test*", "test*"))
        self.assertIn("not defined", "\n".join(cm.output))

    def remove_dataset_out_of_band(self, butler: Butler, ref: DatasetRef) -> None:
        """Simulate an external actor removing a file outside of Butler's
        knowledge.

        Subclasses may override to handle more complicated datastore
        configurations.
        """
        uri = butler.getURI(ref)
        uri.remove()
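        # Also drop any locally cached copy so the datastore cannot satisfy a
        # later get() from its cache and mask the out-of-band deletion.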
        datastore = cast(FileDatastore, butler._datastore)
        datastore.cacheManager.remove_from_cache(ref)

    def testPruneDatasets(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = self.create_empty_butler(writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.dimensions.conform(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        many_stored = butler.stored_many([ref1, ref2, ref3])
        for ref, stored in many_stored.items():
            self.assertTrue(stored, f"Ref {ref} should be stored")

        many_exists = butler._exists_many([ref1, ref2, ref3])
        for ref, exists in many_exists.items():
            self.assertTrue(exists, f"Checking ref {ref} exists.")
            self.assertEqual(exists, DatasetExistence.VERIFIED, f"Ref {ref} should be stored")

        # Simple prune.
        butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
        self.assertFalse(butler.exists(ref1.datasetType, ref1.dataId, collections=run1))

        many_stored = butler.stored_many([ref1, ref2, ref3])
        for ref, stored in many_stored.items():
            self.assertFalse(stored, f"Ref {ref} should not be stored")

        many_exists = butler._exists_many([ref1, ref2, ref3])
        for ref, exists in many_exists.items():
            self.assertEqual(exists, DatasetExistence.UNRECOGNIZED, f"Ref {ref} should not be stored")

        # Put data back.
        ref1_new = butler.put(metric, ref1)
        self.assertEqual(ref1_new, ref1)  # Reuses original ID.
        ref2 = butler.put(metric, ref2)

        many_stored = butler.stored_many([ref1, ref2, ref3])
        self.assertTrue(many_stored[ref1])
        self.assertTrue(many_stored[ref2])
        self.assertFalse(many_stored[ref3])

        ref3 = butler.put(metric, ref3)

        many_exists = butler._exists_many([ref1, ref2, ref3])
        for ref, exists in many_exists.items():
            self.assertTrue(exists, f"Ref {ref} should be stored")

        # Clear out the datasets from registry and start again.
        refs = [ref1, ref2, ref3]
        butler.pruneDatasets(refs, purge=True, unstore=True)
        for ref in refs:
            butler.put(metric, ref)

        # Confirm we can retrieve deferred.
        dref1 = butler.getDeferred(ref1)  # known and exists
        metric1 = dref1.get()
        self.assertEqual(metric1, metric)

        # Test different forms of file availability.
        # Need to be in a state where:
        # - one ref just has a registry record.
        # - one ref has a missing file but a datastore record.
        # - one ref has a missing datastore record but the file is there.
        # - one ref does not exist anywhere.
        # Do not need to test a ref that has everything since that is tested
        # above.
        ref0 = DatasetRef(
            datasetType,
            DataCoordinate.standardize(
                {"instrument": "Cam1", "physical_filter": "Cam1-G"}, universe=butler.dimensions
            ),
            run=run1,
        )

        # Delete from the datastore and retain in registry.
        butler.pruneDatasets([ref1], purge=False, unstore=True, disassociate=False)

        # File has been removed.
        self.remove_dataset_out_of_band(butler, ref2)

        # Datastore has lost track.
        butler._datastore.forget([ref3])

        # First test with a standard butler.
        exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True)
        self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED)
        self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED)
        self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE)
        self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED)

        exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=False)
        self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED)
        self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED | DatasetExistence._ASSUMED)
        self.assertEqual(exists_many[ref2], DatasetExistence.KNOWN)
        self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ASSUMED)
        self.assertTrue(exists_many[ref2])
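        # (DatasetExistence is a flag enum: RECORDED for the registry record
        # and DATASTORE for the datastore record. With full_check=False the
        # file artifact is assumed rather than verified (_ASSUMED), and KNOWN
        # for ref2 above combines RECORDED | DATASTORE | _ASSUMED, which is
        # why it still evaluates as true.)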

        # Check that per-ref query gives the same answer as many query.
        for ref, exists in exists_many.items():
            self.assertEqual(butler.exists(ref, full_check=False), exists)

        # getDeferred checks for existence before it allows the dataset to be
        # retrieved.
        with self.assertRaises(LookupError):
            butler.getDeferred(ref3)  # not known, file exists
        dref2 = butler.getDeferred(ref2)  # known but file missing
        with self.assertRaises(FileNotFoundError):
            dref2.get()

        # Test again with a trusting butler.
        if self.trustModeSupported:
            butler._datastore.trustGetRequest = True
            exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True)
            self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED)
            self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED)
            self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE)
            self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ARTIFACT)

            # When trusting we can get a deferred dataset handle that is not
            # known but does exist.
            dref3 = butler.getDeferred(ref3)
            metric3 = dref3.get()
            self.assertEqual(metric3, metric)

            # Check that per-ref query gives the same answer as many query.
            for ref, exists in exists_many.items():
                self.assertEqual(butler.exists(ref, full_check=True), exists)

            # Create a ref that surprisingly has the UUID of an existing ref
            # but is not the same.
            ref_bad = DatasetRef(datasetType, dataId=ref3.dataId, run=ref3.run, id=ref2.id)
            with self.assertRaises(ValueError):
                butler.exists(ref_bad)

            # Create a ref that has a compatible storage class.
            ref_compat = ref2.overrideStorageClass("StructuredDataDict")
            exists = butler.exists(ref_compat)
            self.assertEqual(exists, exists_many[ref2])

            # Remove everything and start from scratch.
            butler._datastore.trustGetRequest = False
            butler.pruneDatasets(refs, purge=True, unstore=True)
            for ref in refs:
                butler.put(metric, ref)

            # These tests mess directly with the trash table and can leave
            # the datastore in an odd state. Do them at the end.
            # Check that in normal mode, deleting the record means that
            # emptying the trash will not touch the file.
            uri1 = butler.getURI(ref1)
            butler._datastore.bridge.moveToTrash(
                [ref1], transaction=None
            )  # Update the dataset_location table
            butler._datastore.forget([ref1])
            butler._datastore.trash(ref1)
            butler._datastore.emptyTrash()
            self.assertTrue(uri1.exists())
            uri1.remove()  # Clean it up.

            # Simulate execution butler setup by deleting the datastore
            # record but keeping the file around and trusting.
            butler._datastore.trustGetRequest = True
            uris = butler.get_many_uris([ref2, ref3])
            uri2 = uris[ref2].primaryURI
            uri3 = uris[ref3].primaryURI
            self.assertTrue(uri2.exists())
            self.assertTrue(uri3.exists())

            # Remove the datastore record.
            butler._datastore.bridge.moveToTrash(
                [ref2], transaction=None
            )  # Update the dataset_location table
            butler._datastore.forget([ref2])
            self.assertTrue(uri2.exists())
            butler._datastore.trash([ref2, ref3])
            # Immediate removal of the ref2 file.
            self.assertFalse(uri2.exists())
            # But ref3 has to wait for the trash to be emptied.
            self.assertTrue(uri3.exists())
            butler._datastore.emptyTrash()
            self.assertFalse(uri3.exists())

            # Clear out the datasets from registry.
            butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey: str | None = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self) -> None:
        """Independent test of constructor using PathLike."""
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml.
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler.from_config(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass).
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler.from_config(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self) -> None:
        """Test local export using all transfer modes."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        assert isinstance(exportButler._datastore, FileDatastore)
        datastoreRoot = exportButler.get_datastore_roots()[exportButler.get_datastore_names()[0]]

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume a local file system.
            assert path is not None
            self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
                    export.saveDatasets(datasets)
                for path in pathsInStore:
                    assert path is not None
                    self.assertTrue(
                        self.checkFileExists(exportDir, path),
                        f"Check that mode {transfer} exported files",
                    )

    def testPytypeCoercion(self) -> None:
        """Test python type coercion on Butler.get and put."""
        # Store some data with the normal example storage class.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        datasetTypeName = "test_metric"
        butler = self.runPutGetTest(storageClass, datasetTypeName)

        dataId = {"instrument": "DummyCamComp", "visit": 423}
        metric = butler.get(datasetTypeName, dataId=dataId)
        self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")

        datasetType_ori = butler.get_dataset_type(datasetTypeName)
        self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")

        # Now we need to hack the registry dataset type definition.
        # There is no API for this.
        assert isinstance(butler._registry, SqlRegistry)
        manager = butler._registry._managers.datasets
        assert hasattr(manager, "_db") and hasattr(manager, "_static")
        manager._db.update(
            manager._static.dataset_type,
            {"name": datasetTypeName},
            {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
        )
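        # (Note: the second argument to Database.update is a WHERE spec that
        # maps column names to the keys in the row dicts holding the values
        # to match against, which is why the row dict above uses the dataset
        # type name itself as a key.)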

        # Force a reset of the dataset type cache.
        butler.registry.refresh()

        datasetType_new = butler.get_dataset_type(datasetTypeName)
        self.assertEqual(datasetType_new.name, datasetType_ori.name)
        self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")

        metric_model = butler.get(datasetTypeName, dataId=dataId)
        self.assertNotEqual(type(metric_model), type(metric))
        self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")

        # Put the model and read it back to show that everything now
        # works as normal.
        metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
        metric_model_new = butler.get(metric_ref)
        self.assertEqual(metric_model_new, metric_model)

        # Hack the storage class again to something that will fail on the
        # get with no conversion class.
        manager._db.update(
            manager._static.dataset_type,
            {"name": datasetTypeName},
            {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
        )
        butler.registry.refresh()

        with self.assertRaises(ValueError):
            butler.get(datasetTypeName, dataId=dataId)


@unittest.skipUnless(testing is not None, "testing.postgresql module not found")
class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler using Postgres."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "PostgreSQL@test"
    postgresql: Any

    @staticmethod
    def _handler(postgresql: Any) -> None:
        engine = sqlalchemy.engine.create_engine(postgresql.url())
        with engine.begin() as connection:
            connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;"))

    @classmethod
    def setUpClass(cls) -> None:
        # Create the postgres test server.
        cls.postgresql = testing.postgresql.PostgresqlFactory(
            cache_initialized_db=True, on_initialized=cls._handler
        )
        super().setUpClass()

    @classmethod
    def tearDownClass(cls) -> None:
        # Clean up any lingering SQLAlchemy engines/connections
        # so they're closed before we shut down the server.
        gc.collect()
        cls.postgresql.clear_cache()
        super().tearDownClass()

    def setUp(self) -> None:
        self.server = self.postgresql()

        # Need to add a registry section to the config.
        self._temp_config = False
        config = Config(self.configFile)
        config["registry", "db"] = self.server.url()
        with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh:
            config.dump(fh)
            self.configFile = fh.name
            self._temp_config = True
        super().setUp()

    def tearDown(self) -> None:
        self.server.stop()
        if self._temp_config and os.path.exists(self.configFile):
            os.remove(self.configFile)
        super().tearDown()

    def testMakeRepo(self) -> None:
        # The base class test assumes that it's using sqlite and assumes
        # the config file is acceptable to sqlite.
        raise unittest.SkipTest("Postgres config is not compatible with this test.")


@unittest.skipUnless(testing is not None, "testing.postgresql module not found")
class ClonedPostgresPosixDatastoreButlerTestCase(PostgresPosixDatastoreButlerTestCase, unittest.TestCase):
    """Test that a Butler with a Postgres registry still works after cloning."""

    def create_butler(
        self, run: str, storageClass: StorageClass | str, datasetTypeName: str
    ) -> tuple[DirectButler, DatasetType]:
        butler, datasetType = super().create_butler(run, storageClass, datasetTypeName)
        return butler._clone(run=run), datasetType


class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
    """InMemoryDatastore specialization of a butler."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
    fullConfigKey = None
    useTempRoot = False
    validationCanFail = False
    datastoreStr = ["datastore='InMemory"]
    datastoreName = ["InMemoryDatastore@"]
    registryStr = "/gen3.sqlite3"

    def testIngest(self) -> None:
        pass


class ClonedSqliteButlerTestCase(InMemoryDatastoreButlerTestCase, unittest.TestCase):
    """Test that a Butler with a SQLite registry still works after cloning."""

    def create_butler(
        self, run: str, storageClass: StorageClass | str, datasetTypeName: str
    ) -> tuple[DirectButler, DatasetType]:
        butler, datasetType = super().create_butler(run, storageClass, datasetTypeName)
        return butler._clone(run=run), datasetType


class ChainedDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """ChainedDatastore specialization of a butler."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
    fullConfigKey = ".datastore.datastores.1.formatters"
    validationCanFail = True
    datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
    datastoreName = [
        "InMemoryDatastore@",
        f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
        "SecondDatastore",
    ]
    registryStr = "/gen3.sqlite3"

    def testPruneDatasets(self) -> None:
        # This test relies on manipulating files out-of-band, which is
        # impossible for this configuration because of the InMemoryDatastore
        # in the ChainedDatastore.
        pass


class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
    """Test that a yaml file in one location can refer to a root in
    another."""

    datastoreStr = ["dir1"]
    # Disable the makeRepo test since we are deliberately not using
    # butler.yaml as the config name.
    fullConfigKey = None

    def setUp(self) -> None:
        self.root = makeTestTempDir(TESTDIR)

        # Make a new repository in one place.
        self.dir1 = os.path.join(self.root, "dir1")
        Butler.makeRepo(self.dir1, config=Config(self.configFile))

        # Move the yaml file to a different place and add a "root".
        self.dir2 = os.path.join(self.root, "dir2")
        os.makedirs(self.dir2, exist_ok=True)
        configFile1 = os.path.join(self.dir1, "butler.yaml")
        config = Config(configFile1)
        config["root"] = self.dir1
        configFile2 = os.path.join(self.dir2, "butler2.yaml")
        config.dumpToUri(configFile2)
        os.remove(configFile1)
        self.tmpConfigFile = configFile2

    def testFileLocations(self) -> None:
        self.assertNotEqual(self.dir1, self.dir2)
        self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
        self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
        self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))


class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
    """Test that a config file created by makeRepo outside of the repo
    works."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self) -> None:
        self.root = makeTestTempDir(TESTDIR)
        self.root2 = makeTestTempDir(TESTDIR)

        self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
        Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)

    def tearDown(self) -> None:
        if os.path.exists(self.root2):
            shutil.rmtree(self.root2, ignore_errors=True)
        super().tearDown()

    def testConfigExistence(self) -> None:
        c = Config(self.tmpConfigFile)
        uri_config = ResourcePath(c["root"])
        uri_expected = ResourcePath(self.root, forceDirectory=True)
        self.assertEqual(uri_config.geturl(), uri_expected.geturl())
        self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")

    def testPutGet(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")


class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
    """Test that a config directory given to makeRepo outside of the repo
    works."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self) -> None:
        self.root = makeTestTempDir(TESTDIR)
        self.root2 = makeTestTempDir(TESTDIR)

        self.tmpConfigFile = self.root2
        Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)

    def testConfigExistence(self) -> None:
        # Append the yaml file else the Config constructor does not know the
        # file type.
        self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
        super().testConfigExistence()


class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
    """Test that a config URI given to makeRepo outside of the repo works."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self) -> None:
        self.root = makeTestTempDir(TESTDIR)
        self.root2 = makeTestTempDir(TESTDIR)

        self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
        Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)


@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """S3Datastore specialization of a butler: an S3 datastore plus a local
    SQLite SqlRegistry.
    """

    configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
    fullConfigKey = None
    validationCanFail = True

    bucketName = "anybucketname"
    """Name of the bucket that will be used in the tests. The name is read
    from the config file used with the tests during set-up.
    """

    root = "butlerRoot/"
    """Root repository directory expected to be used when useTempRoot=False.
    Otherwise the root is set to a randomly generated 20-character string
    during set-up.
    """

    datastoreStr = [f"datastore={root}"]
    """Contains all expected root locations in a format expected to be
    returned by Butler stringification.
    """

    datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
    """The expected format of the S3 datastore string."""

    registryStr = "/gen3.sqlite3"
    """Expected format of the registry string."""

    mock_aws = mock_aws()
    """The mocked S3 interface from moto."""

    def genRoot(self) -> str:
        """Return a random 20-character string to serve as a root name for
        the temporary bucket repo.

        This is equivalent to tempfile.mkdtemp as this is what self.root
        becomes when useTempRoot is True.
        """
        rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
        return rndstr + "/"

    def setUp(self) -> None:
        config = Config(self.configFile)
        uri = ResourcePath(config[".datastore.datastore.root"])
        self.bucketName = uri.netloc

        # Enable S3 mocking of tests.
        self.enterContext(clean_test_environment_for_s3())
        self.mock_aws.start()

        if self.useTempRoot:
            self.root = self.genRoot()
        rooturi = f"s3://{self.bucketName}/{self.root}"
        config.update({"datastore": {"datastore": {"root": rooturi}}})

        # Need a local folder to store the registry database.
        self.reg_dir = makeTestTempDir(TESTDIR)
        config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"

        # Moto needs to know that we expect the bucket to exist
        # (this used to be the class attribute bucketName).
        s3 = boto3.resource("s3")
        s3.create_bucket(Bucket=self.bucketName)

        self.datastoreStr = [f"datastore='{rooturi}'"]
        self.datastoreName = [f"FileDatastore@{rooturi}"]
        Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
        self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")

    def tearDown(self) -> None:
        s3 = boto3.resource("s3")
        bucket = s3.Bucket(self.bucketName)
        try:
            bucket.objects.all().delete()
        except botocore.exceptions.ClientError as e:
            if e.response["Error"]["Code"] == "404":
                # The key was not reachable - pass.
                pass
            else:
                raise

        bucket = s3.Bucket(self.bucketName)
        bucket.delete()

        # Stop the S3 mock.
        self.mock_aws.stop()

        if self.reg_dir is not None and os.path.exists(self.reg_dir):
            shutil.rmtree(self.reg_dir, ignore_errors=True)

        if self.useTempRoot and os.path.exists(self.root):
            shutil.rmtree(self.root, ignore_errors=True)

        super().tearDown()


class PosixDatastoreTransfers(unittest.TestCase):
    """Test data transfers between butlers.

    Tests cover different dataset-ID managers. UUID to UUID and integer to
    integer are tested. UUID to integer is not supported since we do not
    currently want to allow that. Integer to UUID is supported, with the
    caveat that UUID4s will be generated and these will be incorrect for
    raw dataset types. The test ignores that.
    """

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    storageClassFactory: StorageClassFactory

    @classmethod
    def setUpClass(cls) -> None:
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def setUp(self) -> None:
        self.root = makeTestTempDir(TESTDIR)
        self.config = Config(self.configFile)

    def tearDown(self) -> None:
        removeTestTempDir(self.root)

    def create_butler(self, manager: str, label: str) -> Butler:
        config = Config(self.configFile)
        config["registry", "managers", "datasets"] = manager
        return Butler.from_config(
            Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True
        )

    def create_butlers(self, manager1: str | None = None, manager2: str | None = None) -> None:
        default = "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID"
        if manager1 is None:
            manager1 = default
        if manager2 is None:
            manager2 = default
        self.source_butler = self.create_butler(manager1, "1")
        self.target_butler = self.create_butler(manager2, "2")

    def testTransferUuidToUuid(self) -> None:
        self.create_butlers()
        self.assertButlerTransfers()

    def testTransferMissing(self) -> None:
        """Test transfers where datastore records are missing.

        This is how execution butler works.
        """
        self.create_butlers()

        # Configure the source butler to allow trust.
        self.source_butler._datastore._set_trust_mode(True)

        self.assertButlerTransfers(purge=True)

    def testTransferMissingDisassembly(self) -> None:
        """Test transfers where datastore records are missing.

        This is how execution butler works.
        """
        self.create_butlers()

        # Configure the source butler to allow trust.
        self.source_butler._datastore._set_trust_mode(True)

        # Test disassembly.
        self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")

    def testAbsoluteURITransferDirect(self) -> None:
        """Test transfer using an absolute URI."""
        self._absolute_transfer("auto")

    def testAbsoluteURITransferCopy(self) -> None:
        """Test transfer using an absolute URI."""
        self._absolute_transfer("copy")

    def _absolute_transfer(self, transfer: str) -> None:
        self.create_butlers()

        storageClassName = "StructuredData"
        storageClass = self.storageClassFactory.getStorageClass(storageClassName)
        datasetTypeName = "random_data"
        run = "run1"
        self.source_butler.registry.registerCollection(run, CollectionType.RUN)

        dimensions = self.source_butler.dimensions.conform(())
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        self.source_butler.registry.registerDatasetType(datasetType)

        metrics = makeExampleMetrics()
        with ResourcePath.temporary_uri(suffix=".json") as temp:
            dataId = DataCoordinate.make_empty(self.source_butler.dimensions)
            source_refs = [DatasetRef(datasetType, dataId, run=run)]
            temp.write(json.dumps(metrics.exportAsDict()).encode())
            dataset = FileDataset(path=temp, refs=source_refs)
            self.source_butler.ingest(dataset, transfer="direct")
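            # ("direct" ingest records the absolute URI of the file rather
            # than copying it into the datastore, so the "auto" transfer
            # below should leave the target pointing at the same absolute
            # location, while "copy" should materialize a new file.)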

            self.target_butler.transfer_from(
                self.source_butler, dataset.refs, register_dataset_types=True, transfer=transfer
            )

            uri = self.target_butler.getURI(dataset.refs[0])
            if transfer == "auto":
                self.assertEqual(uri, temp)
            else:
                self.assertNotEqual(uri, temp)

    def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "StructuredData") -> None:
        """Test that a run can be transferred to another butler."""
        storageClass = self.storageClassFactory.getStorageClass(storageClassName)
        datasetTypeName = "random_data"

        # The test will create 3 collections and we will want to transfer
        # two of those three.
        runs = ["run1", "run2", "other"]

        # Also want to use two different dataset types to ensure that
        # grouping works.
        datasetTypeNames = ["random_data", "random_data_2"]

        # Create the run collections in the source butler.
        for run in runs:
            self.source_butler.registry.registerCollection(run, CollectionType.RUN)

        # Create dimensions in the source butler.
        n_exposures = 30
        self.source_butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        self.source_butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        self.source_butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )
        self.source_butler.registry.insertDimensionData(
            "day_obs",
            {
                "instrument": "DummyCamComp",
                "id": 20250101,
            },
        )

        for i in range(n_exposures):
            self.source_butler.registry.insertDimensionData(
                "group", {"instrument": "DummyCamComp", "name": f"group{i}"}
            )
            self.source_butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "physical_filter": "d-r",
                    "group": f"group{i}",
                    "day_obs": 20250101,
                },
            )

        # Create dataset types in the source butler.
        dimensions = self.source_butler.dimensions.conform(["instrument", "exposure"])
        for datasetTypeName in datasetTypeNames:
            datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
            self.source_butler.registry.registerDatasetType(datasetType)

        # Write a dataset to an unrelated run -- this will ensure that
        # we are rewriting integer dataset ids in the target if necessary.
        # Will not be relevant for UUID.
        run = "distraction"
        butler = Butler.from_config(butler=self.source_butler, run=run)
        butler.put(
            makeExampleMetrics(),
            datasetTypeName,
            exposure=1,
            instrument="DummyCamComp",
            physical_filter="d-r",
        )

        # Write some example metrics to the source.
        butler = Butler.from_config(butler=self.source_butler)

        # Set of DatasetRefs that should be in the list of refs to transfer
        # but which will not be transferred.
        deleted: set[DatasetRef] = set()

        n_expected = 20  # Number of datasets expected to be transferred.
        source_refs = []
        for i in range(n_exposures):
            # Put a third of the datasets into each collection; only retain
            # two thirds.
            index = i % 3
            run = runs[index]
            datasetTypeName = datasetTypeNames[i % 2]

            metric = MetricsExample(
                summary={"counter": i}, output={"text": "metric"}, data=[2 * x for x in range(i)]
            )
            dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)

            # Remove the datastore record using the low-level API, but only
            # for a specific index.
            if purge and index == 1:
                # For one of these, delete the file as well.
                # This allows the "missing" code to filter the
                # file out.
                # Access the individual datastores.
                datastores = []
                if hasattr(butler._datastore, "datastores"):
                    datastores.extend(butler._datastore.datastores)
                else:
                    datastores.append(butler._datastore)

                if not deleted:
                    # For a chained datastore we need to remove
                    # files in each chain.
                    for datastore in datastores:
                        # The file might not be known to the datastore
                        # if constraints are used.
                        try:
                            primary, uris = datastore.getURIs(ref)
                        except FileNotFoundError:
                            continue
                        if primary and primary.scheme != "mem":
                            primary.remove()
                        for uri in uris.values():
                            if uri.scheme != "mem":
                                uri.remove()
                    n_expected -= 1
                    deleted.add(ref)

                # Remove the datastore record.
                for datastore in datastores:
                    if hasattr(datastore, "removeStoredItemInfo"):
                        datastore.removeStoredItemInfo(ref)

            if index < 2:
                source_refs.append(ref)
            if ref not in deleted:
                new_metric = butler.get(ref)
                self.assertEqual(new_metric, metric)

        # Create some bad dataset types to ensure we check for inconsistent
        # definitions.
        badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
        for datasetTypeName in datasetTypeNames:
            datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
            self.target_butler.registry.registerDatasetType(datasetType)
        with self.assertRaises(ConflictingDefinitionError) as cm:
            self.target_butler.transfer_from(self.source_butler, source_refs)
        self.assertIn("dataset type differs", str(cm.exception))

        # And remove the bad definitions.
        for datasetTypeName in datasetTypeNames:
            self.target_butler.registry.removeDatasetType(datasetTypeName)

        # Transfer without creating dataset types should fail.
        with self.assertRaises(KeyError):
            self.target_butler.transfer_from(self.source_butler, source_refs)

        # Transfer without creating dimensions should fail.
        with self.assertRaises(ConflictingDefinitionError) as cm:
            self.target_butler.transfer_from(self.source_butler, source_refs, register_dataset_types=True)
        self.assertIn("dimension", str(cm.exception))

        # The failed transfer above leaves the registry in an inconsistent
        # state because the run is created but then rolled back without
        # the collection cache being cleared. For now force a refresh.
        # Can remove with DM-35498.
        self.target_butler.registry.refresh()

        # Do a dry run -- this should not have any effect on the target
        # butler.
        self.target_butler.transfer_from(self.source_butler, source_refs, dry_run=True)

        # Transfer the records for one ref to test the alternative API.
        with self.assertLogs(logger="lsst", level=logging.DEBUG) as log_cm:
            self.target_butler.transfer_dimension_records_from(self.source_butler, [source_refs[0]])
        self.assertIn("number of records transferred: 1", ";".join(log_cm.output))

        # Now transfer them to the second butler, including dimensions.
        with self.assertLogs(logger="lsst", level=logging.DEBUG) as log_cm:
            transferred = self.target_butler.transfer_from(
                self.source_butler,
                source_refs,
                register_dataset_types=True,
                transfer_dimensions=True,
            )
        self.assertEqual(len(transferred), n_expected)
        log_output = ";".join(log_cm.output)

        # A ChainedDatastore will use the in-memory datastore for mexists
        # so we cannot rely on the mexists log message.
        self.assertIn("Number of datastore records found in source", log_output)
        self.assertIn("Creating output run", log_output)

        # Do the transfer twice to ensure that it will do nothing extra.
        # Only do this if purge=True because it does not work for int
        # dataset_id.
        if purge:
            # This should not need to register dataset types.
            transferred = self.target_butler.transfer_from(self.source_butler, source_refs)
            self.assertEqual(len(transferred), n_expected)

        # Also do an explicit low-level transfer to trigger some
        # edge cases.
        with self.assertLogs(level=logging.DEBUG) as log_cm:
            self.target_butler._datastore.transfer_from(self.source_butler._datastore, source_refs)
        log_output = ";".join(log_cm.output)
        self.assertIn("no file artifacts exist", log_output)

        with self.assertRaises((TypeError, AttributeError)):
            self.target_butler._datastore.transfer_from(self.source_butler, source_refs)  # type: ignore

        with self.assertRaises(ValueError):
            self.target_butler._datastore.transfer_from(
                self.source_butler._datastore, source_refs, transfer="split"
            )

        # Now try to get the same refs from the new butler.
        for ref in source_refs:
            if ref not in deleted:
                new_metric = self.target_butler.get(ref)
                old_metric = self.source_butler.get(ref)
                self.assertEqual(new_metric, old_metric)

        # Now prune the run2 collection and create instead a CHAINED
        # collection. This should block the transfer.
        self.target_butler.removeRuns(["run2"], unstore=True)
        self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
        with self.assertRaises(CollectionTypeError):
            # Re-importing the run1 datasets can be problematic if they
            # use integer IDs so filter those out.
            to_transfer = [ref for ref in source_refs if ref.run == "run2"]
            self.target_butler.transfer_from(self.source_butler, to_transfer)


class ChainedDatastoreTransfers(PosixDatastoreTransfers):
    """Test transfers using a chained datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")


class NullDatastoreTestCase(unittest.TestCase):
    """Test that we can fall back to a null datastore."""

    # Need a good config to create the repo.
    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    storageClassFactory: StorageClassFactory

    @classmethod
    def setUpClass(cls) -> None:
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def setUp(self) -> None:
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))

    def tearDown(self) -> None:
        removeTestTempDir(self.root)

    def test_fallback(self) -> None:
        # Read the butler config and mess with the datastore section.
        config_path = os.path.join(self.root, "butler.yaml")
        bad_config = Config(config_path)
        bad_config["datastore", "cls"] = "lsst.not.a.datastore.Datastore"
        bad_config.dumpToUri(config_path)

        with self.assertRaises(RuntimeError):
            Butler(self.root, without_datastore=False)

        with self.assertRaises(RuntimeError):
            Butler.from_config(self.root, without_datastore=False)

        butler = Butler.from_config(self.root, writeable=True, without_datastore=True)
        self.assertIsInstance(butler._datastore, NullDatastore)

        # Check that the registry is working.
        butler.registry.registerRun("MYRUN")
        collections = butler.registry.queryCollections(...)
        self.assertIn("MYRUN", set(collections))

        # Create a ref.
        dimensions = butler.dimensions.conform([])
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "metric"
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)
        ref = DatasetRef(datasetType, {}, run="MYRUN")

        # Check that the datastore will complain.
        with self.assertRaises(FileNotFoundError):
            butler.get(ref)
        with self.assertRaises(FileNotFoundError):
            butler.getURI(ref)


@unittest.skipIf(create_test_server is None, "Server dependencies not installed.")
class ButlerServerTests(FileDatastoreButlerTests, unittest.TestCase):
    """Test RemoteButler and the Butler server."""

    configFile = None
    predictionSupported = False
    trustModeSupported = False

    def setUp(self):
        self.server_instance = self.enterContext(create_test_server(TESTDIR))

    def tearDown(self):
        pass

    def are_uris_equivalent(self, uri1: ResourcePath, uri2: ResourcePath) -> bool:
        # S3 pre-signed URLs may end up with differing expiration times in the
        # query parameters, so ignore query parameters when comparing.
        return uri1.scheme == uri2.scheme and uri1.netloc == uri2.netloc and uri1.path == uri2.path

    def create_empty_butler(self, run: str | None = None, writeable: bool | None = None) -> Butler:
        return self.server_instance.hybrid_butler._clone(run=run)

    def remove_dataset_out_of_band(self, butler: Butler, ref: DatasetRef) -> None:
        # Can't delete a file via S3 signed URLs, so we need to reach in
        # through DirectButler to delete the dataset.
        uri = self.server_instance.direct_butler.getURI(ref)
        uri.remove()

    def testConstructor(self):
        # The RemoteButler constructor is tested in test_server.py and
        # test_remote_butler.py.
        pass

    def testDafButlerRepositories(self):
        # Loading of RemoteButler via the repository index is tested in
        # test_server.py.
        pass

    def testGetDatasetTypes(self) -> None:
        # This is mostly a test of validateConfiguration, which is for
        # validating Datastore configuration and thus isn't relevant to
        # RemoteButler.
        pass

    def testMakeRepo(self) -> None:
        # Only applies to DirectButler.
        pass

    # Pickling is not yet implemented for RemoteButler/HybridButler.
    @unittest.expectedFailure
    def testPickle(self) -> None:
        return super().testPickle()

    def testStringification(self) -> None:
        self.assertEqual(
            str(self.server_instance.remote_butler),
            "RemoteButler(https://test.example/api/butler/repo/testrepo)",
        )

    def testTransaction(self) -> None:
        # Transactions will never be supported for RemoteButler.
        pass

    def testPutTemplates(self) -> None:
        # The Butler server instance is configured with different file naming
        # templates than this test is expecting.
        pass


def setup_module(module: types.ModuleType) -> None:
    """Set up the module for pytest."""
    clean_environment()


if __name__ == "__main__":
    clean_environment()
    unittest.main()