# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler.
"""
from __future__ import annotations

import gc
import json
import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import string
import tempfile
import unittest
import unittest.mock
import uuid
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, cast

try:
    import boto3
    import botocore
    from moto import mock_s3  # type: ignore[import]
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto mock_s3 cannot be imported."""
        return cls
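
# This fallback keeps ``@mock_s3``-decorated test classes importable when
# moto is absent; the decorator then simply returns the class unchanged.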

try:
    # It's possible but silly to have testing.postgresql installed without
    # having the postgresql server installed (because then nothing in
    # testing.postgresql would work), so we use the presence of that module
    # to test whether we can expect the server to be available.
    import testing.postgresql  # type: ignore[import]
except ImportError:
    testing = None

import astropy.time
import sqlalchemy
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    CollectionType,
    Config,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplate,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.datastores.fileDatastore import FileDatastore
from lsst.daf.butler.registries.sql import SqlRegistry
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
    OrphanedRecordError,
)
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImportType
from lsst.utils.ellipsis import Ellipsis
from lsst.utils.introspection import get_full_type_name

if TYPE_CHECKING:
    from lsst.daf.butler import Datastore, DimensionGraph, Registry, StorageClass

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )
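

# The three positional arguments above most likely map to the example's
# ``summary``, ``output``, and ``data`` attributes; the put/get tests below
# read these back both whole and as individual components (an inference from
# the assertions in runPutGetTest, not from the MetricsExample API itself).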


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent misdiagnosing
    that might otherwise occur when a standard exception is used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not covered by any other test
    case."""

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests(TestCaseMixin):
    """Helper methods for running a suite of put/get tests from different
    butler configurations."""

    root: str
    default_run = "ingésτ😺"
    storageClassFactory: StorageClassFactory
    configFile: str
    tmpConfigFile: str

    @staticmethod
    def addDatasetType(
        datasetTypeName: str, dimensions: DimensionGraph, storageClass: StorageClass | str, registry: Registry
    ) -> DatasetType:
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls) -> None:
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None) -> None:
        """Check that each named component can be retrieved and matches the
        corresponding attribute of the reference object, both directly and
        via a deferred handle."""
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self) -> None:
        removeTestTempDir(self.root)

    def create_butler(
        self, run: str, storageClass: StorageClass | str, datasetTypeName: str
    ) -> tuple[Butler, DatasetType]:
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "visit_system": 1,
                },
            )
        return butler, datasetType

    def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Butler:
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = self.default_run
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)
        assert butler.run is not None

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = butler.registry.expandDataId({"instrument": "DummyCamComp", "visit": 423})

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        ref = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run="put_run_1")
        args: tuple[DatasetRef] | tuple[str | DatasetType, DataCoordinate]
        for args in ((ref,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

            # Can the artifacts themselves be retrieved?
            if not butler.datastore.isEphemeral:
                root_uri = ResourcePath(self.root)

                for preserve_path in (True, False):
                    destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                    # Use copy so that we can test that overwrite
                    # protection works (using "auto" for File URIs would
                    # use hard links and subsequent transfer would work
                    # because it knows they are the same file).
                    transferred = butler.retrieveArtifacts(
                        [ref], destination, preserve_path=preserve_path, transfer="copy"
                    )
                    self.assertGreater(len(transferred), 0)
                    artifacts = list(ResourcePath.findFileResources([destination]))
                    self.assertEqual(set(transferred), set(artifacts))

                    for artifact in transferred:
                        path_in_destination = artifact.relative_to(destination)
                        self.assertIsNotNone(path_in_destination)
                        assert path_in_destination is not None

                        # when path is not preserved there should not be
                        # any path separators.
                        num_seps = path_in_destination.count("/")
                        if preserve_path:
                            self.assertGreater(num_seps, 0)
                        else:
                            self.assertEqual(num_seps, 0)

                    primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                    n_uris = len(secondary_uris)
                    if primary_uri:
                        n_uris += 1
                    self.assertEqual(
                        len(artifacts),
                        n_uris,
                        "Comparing expected artifacts vs actual:"
                        f" {artifacts} vs {primary_uri} and {secondary_uris}",
                    )

                    if preserve_path:
                        # No need to run these twice
                        with self.assertRaises(ValueError):
                            butler.retrieveArtifacts([ref], destination, transfer="move")

                        with self.assertRaises(FileExistsError):
                            butler.retrieveArtifacts([ref], destination)

                        transferred_again = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, overwrite=True
                        )
                        self.assertEqual(set(transferred_again), set(transferred))

            # Now remove the dataset completely.
            butler.pruneDatasets([ref], purge=True, unstore=True)
            # Lookup with original args should still fail.
            with self.assertRaises(LookupError):
                butler.datasetExists(*args, collections=this_run)
            # get() should still fail.
            with self.assertRaises(FileNotFoundError):
                butler.get(ref)
            # Registry shouldn't be able to find it by dataset_id anymore.
            self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know the collection is
            # now empty.
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Create DatasetRef for put using default run.
        refIn = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run=butler.run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            assert compRef is not None
            summary = butler.get(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaisesRegex(ValueError, "Supplied dataset type .* inconsistent with registry"):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaisesRegex(ValueError, "DatasetRef given, cannot use dataId as well"):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match.
        with self.assertRaises(FileNotFoundError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=uuid.UUID(int=101), run=butler.run))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaisesRegex(KeyError, "Parameter 'unsupported' not understood"):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have the expected collections
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Add the same ref again, so we can check that duplicate put fails.
        ref = butler.put(metric, datasetType, dataId)

        # Repeat put will fail.
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Repeat the same sequence with a resolved ref.
        butler.pruneDatasets([ref], unstore=True, purge=True)
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaisesRegex(ConflictingDefinitionError, "Datastore already contains dataset"):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # With a resolved ref this write will succeed.
        ref = butler.put(metric, refIn)

        # Leave the dataset in place since some downstream tests require
        # something to be present.

        return butler

    def testDeferredCollectionPassing(self) -> None:
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op.
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is a
        # CollectionError.
        with self.assertRaises(CollectionError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Removing the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True
    validationCanFail: bool
    fullConfigKey: str | None
    registryStr: str | None
    datastoreName: list[str] | None
    datastoreStr: list[str]

    def setUp(self) -> None:
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self) -> None:
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run=self.default_run)
            self.assertIsInstance(butler, Butler)

            # Even with a ResourcePath.
            butler = Butler(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {self.default_run})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, ("other",))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
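        # A repository index is just a small YAML or JSON mapping of labels
        # to butler config URIs, e.g. (illustrative, matching the Config
        # assembled below):
        #
        #     label: <path to this repo's butler.yaml>
        #     bad_label: s3://bucket/not_real.yaml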
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler("not_there", writeable=False)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases"):
            # No aliases registered.
            Butler("not_there")
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testStorageClassOverrideGet(self) -> None:
        """Test storage class conversion on get with override."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        datasetTypeName = "anything"
        run = self.default_run

        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset.
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        ref = butler.put(metric, datasetType, dataId)

        # Return native type.
        retrieved = butler.get(ref)
        self.assertEqual(retrieved, metric)

        # Specify an override.
        new_sc = self.storageClassFactory.getStorageClass("MetricsConversion")
        model = butler.get(ref, storageClass=new_sc)
        self.assertNotEqual(type(model), type(retrieved))
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override later.
        deferred = butler.getDeferred(ref)
        model = deferred.get(storageClass=new_sc)
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override up front.
        deferred = butler.getDeferred(ref, storageClass=new_sc)
        model = deferred.get()
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Retrieve a component. Should be a tuple.
        data = butler.get("anything.data", dataId, storageClass="StructuredDataDataTestTuple")
        self.assertIs(type(data), tuple)
        self.assertEqual(data, tuple(retrieved.data))

        # Parameter on the write storage class should work regardless
        # of read storage class.
        data = butler.get(
            "anything.data",
            dataId,
            storageClass="StructuredDataDataTestTuple",
            parameters={"slice": slice(2, 4)},
        )
        self.assertEqual(len(data), 2)

        # Try a parameter that is known to the read storage class but not
        # the write storage class.
        with self.assertRaises(KeyError):
            butler.get(
                "anything.data",
                dataId,
                storageClass="StructuredDataDataTestTuple",
                parameters={"xslice": slice(2, 4)},
            )

    def testPytypePutCoercion(self) -> None:
        """Test python type coercion on Butler.get and put."""

        # Store some data with the normal example storage class.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        datasetTypeName = "test_metric"
        butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)

        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Put a dict and this should coerce to a MetricsExample
        test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
        metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
        test_metric = butler.get(metric_ref)
        self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
        self.assertEqual(test_metric.summary, test_dict["summary"])
        self.assertEqual(test_metric.output, test_dict["output"])

        # Check that the put still works if a DatasetType is given with
        # a definition matching this python type.
        registry_type = butler.registry.getDatasetType(datasetTypeName)
        this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
        metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
        self.assertEqual(metric2_ref.datasetType, registry_type)

        # The get will return the type expected by registry.
        test_metric2 = butler.get(metric2_ref)
        self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")

        # Make a new DatasetRef with the compatible but different DatasetType.
        # This should now return a dict.
        new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
        test_dict2 = butler.get(new_ref)
        self.assertEqual(get_full_type_name(test_dict2), "dict")

        # Get it again with the differing dataset type definition, this time
        # passing the DatasetType and data ID to get() rather than a resolved
        # ref. This should be consistent with the ref-based get() behavior
        # and return the type of the supplied DatasetType.
        test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
        self.assertEqual(get_full_type_name(test_dict3), "dict")

    def testIngest(self) -> None:
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImportType("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, run=self.default_run)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, run=self.default_run))

        # Test "move" transfer to ensure that the files themselves
        # have disappeared following ingest.
        with ResourcePath.temporary_uri(suffix=".yaml") as tempFile:
            tempFile.transfer_from(ResourcePath(metricFile), transfer="copy")

            datasets = []
            datasets.append(FileDataset(path=tempFile, refs=refs, formatter=MultiDetectorFormatter))

            # For first ingest use copy.
            butler.ingest(*datasets, transfer="copy", record_validation_info=False)

            # Now try to ingest again in "execution butler" mode where
            # the registry entries exist but the datastore does not have
            # the files. We also need to strip the dimension records to ensure
            # that they will be re-added by the ingest.
            ref = datasets[0].refs[0]
            datasets[0].refs = [
                cast(
                    DatasetRef,
                    butler.registry.findDataset(ref.datasetType, dataId=ref.dataId, collections=ref.run),
                )
                for ref in datasets[0].refs
            ]
            all_refs = []
            for dataset in datasets:
                refs = []
                for ref in dataset.refs:
                    # Create a dict from the dataId to drop the records.
                    new_data_id = {str(k): v for k, v in ref.dataId.items()}
                    new_ref = butler.registry.findDataset(ref.datasetType, new_data_id, collections=ref.run)
                    assert new_ref is not None
                    self.assertFalse(new_ref.dataId.hasRecords())
                    refs.append(new_ref)
                dataset.refs = refs
                all_refs.extend(dataset.refs)
            butler.pruneDatasets(all_refs, disassociate=False, unstore=True, purge=False)

            # Use move mode to test that the file is deleted. Also
            # disable recording of file size.
            butler.ingest(*datasets, transfer="move", record_validation_info=False)

            # Check that every ref now has records.
            for dataset in datasets:
                for ref in dataset.refs:
                    self.assertTrue(ref.dataId.hasRecords())

            # Ensure that the file has disappeared.
            self.assertFalse(tempFile.exists())

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])  # type: ignore[attr-defined]
            self.assertEqual(infos[0].file_size, -1)
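            # (A file_size of -1 appears to be the sentinel for "size not
            # recorded" when record_validation_info=False; an inference from
            # this assertion, not documented datastore behavior.)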
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

        # Ensure we can ingest 0 datasets
        datasets = []
        butler.ingest(*datasets)

    def testPickle(self) -> None:
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self) -> None:
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries: list[tuple[str, list[Mapping[str, Any]]]] = [
            (
                "instrument",
                [
                    {"instrument": "DummyCam"},
                    {"instrument": "DummyHSC"},
                    {"instrument": "DummyCamComp"},
                ],
            ),
            ("physical_filter", [{"instrument": "DummyCam", "name": "d-r", "band": "R"}]),
            ("visit", [{"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}]),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for element, data in dimensionEntries:
            butler.registry.insertDimensionData(element, *data)

        # When a DatasetType is added to the registry, entries are not created
        # for its components, but querying can still return the components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry: set[DatasetType] = set()
        for parent_dataset_type in butler.registry.queryDatasetTypes():
            fromRegistry.add(parent_dataset_type)
            fromRegistry.update(parent_dataset_type.makeAllComponentDatasetTypes())
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self) -> None:
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries: tuple[tuple[str, Mapping[str, Any]], ...] = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.get(ref)

    def testMakeRepo(self) -> None:
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # Create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self) -> None:
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self) -> None:
        """Test that dataIds can be rewritten based on dimension records."""

        butler = Butler(self.tmpConfigFile, run=self.default_run)

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
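            # The dataset type only has {instrument, exposure} dimensions, so
            # the butler has to use the exposure dimension records to map
            # (seq_num, day_obs) to the exposure ID; the assertion on
            # ref.dataId["exposure"] below checks that rewrite.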
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root: str | ResourcePath, relpath: str | ResourcePath) -> bool:
        """Check whether a file exists at the given path relative to root.

        Test testPutTemplates verifies actual physical existence of the files
        in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use the default
        # template); metric3 is registered as well for the non-unique-template
        # check at the end of this test.
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(uri.exists())
        self.assertTrue(
            uri.unquoted_path.endswith(f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle")
        )

        # Check the template based on dimensions
        if hasattr(butler.datastore, "templates"):
            butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(uri.exists())
        self.assertTrue(
            uri.unquoted_path.endswith(f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle")
        )

        # Check the template based on dimensions
        if hasattr(butler.datastore, "templates"):
            butler.datastore.templates.validateTemplates([ref])

        # Use a template that has a typo in dimension record metadata.
        # Easier to test with a butler that has a ref with records attached.
        template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits")
        with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"):
            path = template.format(ref)
        self.assertEqual(path, f"a/v423/{ref.id}_fits")
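
        # The ``:?`` suffix marks ``visit.namex`` as optional: the unknown
        # field is dropped with a log message instead of raising. Without it,
        # the same typo raises KeyError, as the next block demonstrates.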

        template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits")
        with self.assertRaises(KeyError):
            with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"):
                template.format(ref)

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self) -> None:
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self) -> None:
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass: StorageClass) -> None:
        """Export to a temp directory and import back into a new
        temp-directory repo. Does not assume a posix datastore."""
        exportButler = self.runPutGetTest(storageClass, "test_metric")

        # Test that we must have a file extension.
        with self.assertRaises(ValueError):
            with exportButler.export(filename="dump", directory=".") as export:
                pass

        # Test that unknown format is not allowed.
        with self.assertRaises(ValueError):
            with exportButler.export(filename="dump.fits", directory=".") as export:
                pass

        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=Ellipsis))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements even
                # though there aren't any in these datasets or in the database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public api.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                    )
                importButler = Butler(importDir, run=self.default_run)
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )

    def testRemoveRuns(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put a dataset in each.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])

        with self.assertRaises(OrphanedRecordError):
            butler.registry.removeDatasetType(datasetType.name)

        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())

        # Now that the collections have been pruned we can remove the
        # dataset type
        butler.registry.removeDatasetType(datasetType.name)

        with self.assertLogs("lsst.daf.butler.registries", "INFO") as cm:
            butler.registry.removeDatasetType(("test*", "test*"))
        self.assertIn("not defined", "\n".join(cm.output))


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler"""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey: str | None = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self) -> None:
        """Independent test of constructor using PathLike."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)
1402 def testExportTransferCopy(self) -> None:
1403 """Test local export using all transfer modes"""
1404 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1405 exportButler = self.runPutGetTest(storageClass, "test_metric")
1406 # Test that the repo actually has at least one dataset.
1407 datasets = list(exportButler.registry.queryDatasets(..., collections=Ellipsis))
1408 self.assertGreater(len(datasets), 0)
1409 uris = [exportButler.getURI(d) for d in datasets]
1410 assert isinstance(exportButler.datastore, FileDatastore)
1411 datastoreRoot = exportButler.datastore.root
1413 pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]
1415 for path in pathsInStore:
1416 # Assume a local file system.
1417 assert path is not None
1418 self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")
1420 for transfer in ("copy", "link", "symlink", "relsymlink"):
1421 with safeTestTempDir(TESTDIR) as exportDir:
1422 with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
1423 export.saveDatasets(datasets)
1424 for path in pathsInStore:
1425 assert path is not None
1426 self.assertTrue(
1427 self.checkFileExists(exportDir, path),
1428 f"Check that mode {transfer} exported files",
1429 )
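# A minimal sketch (not collected as a test) of the export API used
# above; the export directory here is hypothetical:
def _sketch_export_usage(self, butler: Butler) -> None:
    datasets = list(butler.registry.queryDatasets(..., collections=...))
    with butler.export(directory="/tmp/export", format="yaml", transfer="copy") as export:
        export.saveDatasets(datasets)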
1431 def testPruneDatasets(self) -> None:
1432 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1433 butler = Butler(self.tmpConfigFile, writeable=True)
1434 assert isinstance(butler.datastore, FileDatastore)
1435 # Load registry data with dimensions to hang datasets off of.
1436 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1437 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1438 # Add some RUN-type collections.
1439 run1 = "run1"
1440 butler.registry.registerRun(run1)
1441 run2 = "run2"
1442 butler.registry.registerRun(run2)
1443 # put some datasets. ref1 and ref2 have the same data ID, and are in
1444 # different runs. ref3 has a different data ID.
1445 metric = makeExampleMetrics()
1446 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1447 datasetType = self.addDatasetType(
1448 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1449 )
1450 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1451 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1452 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1454 # Simple prune.
1455 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1456 with self.assertRaises(LookupError):
1457 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)
1459 # Put data back.
1460 ref1 = butler.put(metric, ref1, run=run1)
1461 ref2 = butler.put(metric, ref2, run=run2)
1462 ref3 = butler.put(metric, ref3, run=run1)
1464 # Check that in normal mode, deleting the datastore record first
1465 # means that trashing and emptying the trash will not touch the file.
1466 uri1 = butler.datastore.getURI(ref1)
1467 butler.datastore.bridge.moveToTrash([ref1], transaction=None) # Update the dataset_location table
1468 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
1469 butler.datastore.trash(ref1)
1470 butler.datastore.emptyTrash()
1471 self.assertTrue(uri1.exists())
1472 uri1.remove() # Clean it up.
1474 # Simulate an execution butler setup by deleting the datastore
1475 # record but keeping the file around and enabling trust mode.
1476 butler.datastore.trustGetRequest = True
1477 uri2 = butler.datastore.getURI(ref2)
1478 uri3 = butler.datastore.getURI(ref3)
1479 self.assertTrue(uri2.exists())
1480 self.assertTrue(uri3.exists())
1482 # Remove the datastore record.
1483 butler.datastore.bridge.moveToTrash([ref2], transaction=None) # Update the dataset_location table
1484 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
1485 self.assertTrue(uri2.exists())
1486 butler.datastore.trash([ref2, ref3])
1487 # Immediate removal of the ref2 file, since its record is gone.
1488 self.assertFalse(uri2.exists())
1489 # But ref3 has to wait for the trash to be emptied.
1490 self.assertTrue(uri3.exists())
1491 butler.datastore.emptyTrash()
1492 self.assertFalse(uri3.exists())
1494 # Clear out the datasets from registry.
1495 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
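# A sketch (not collected as a test) of the two-phase deletion
# exercised above: in the normal case trash() only marks the refs and
# the file artifacts are removed when the trash is emptied:
def _sketch_trash_usage(self, butler: Butler, refs: list[DatasetRef]) -> None:
    assert isinstance(butler.datastore, FileDatastore)
    butler.datastore.trash(refs)  # refs moved to the trash table
    butler.datastore.emptyTrash()  # file artifacts removed here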
1497 def testPytypeCoercion(self) -> None:
1498 """Test python type coercion on Butler.get and put."""
1500 # Store some data with the normal example storage class.
1501 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1502 datasetTypeName = "test_metric"
1503 butler = self.runPutGetTest(storageClass, datasetTypeName)
1505 dataId = {"instrument": "DummyCamComp", "visit": 423}
1506 metric = butler.get(datasetTypeName, dataId=dataId)
1507 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1509 datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
1510 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1512 # Now we need to hack the registry dataset type definition
1513 # directly; there is no public API for this.
1514 assert isinstance(butler.registry, SqlRegistry)
1515 manager = butler.registry._managers.datasets
1516 assert hasattr(manager, "_db") and hasattr(manager, "_static")
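# Note: the ``where`` argument of Database.update() maps column names
# to keys in the row dict, so the row carries the WHERE bind value
# under the datasetTypeName key alongside the new storage_class.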
1517 manager._db.update(
1518 manager._static.dataset_type,
1519 {"name": datasetTypeName},
1520 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1521 )
1523 # Force a reset of the dataset type cache.
1524 butler.registry.refresh()
1526 datasetType_new = butler.registry.getDatasetType(datasetTypeName)
1527 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1528 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1530 metric_model = butler.get(datasetTypeName, dataId=dataId)
1531 self.assertNotEqual(type(metric_model), type(metric))
1532 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1534 # Put the model and read it back to show that everything now
1535 # works as normal.
1536 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1537 metric_model_new = butler.get(metric_ref)
1538 self.assertEqual(metric_model_new, metric_model)
1540 # Hack the storage class again to something that will fail on
1541 # get because no conversion to the python type is possible.
1542 manager._db.update(
1543 manager._static.dataset_type,
1544 {"name": datasetTypeName},
1545 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1546 )
1547 butler.registry.refresh()
1549 with self.assertRaises(ValueError):
1550 butler.get(datasetTypeName, dataId=dataId)
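# A sketch of the pytype coercion behaviour checked above: the python
# type returned by Butler.get() follows the storage class currently
# registered for the dataset type, not the type used at put() time:
def _sketch_pytype_coercion(butler: Butler, datasetTypeName: str, dataId: dict) -> None:
    datasetType = butler.registry.getDatasetType(datasetTypeName)
    obj = butler.get(datasetTypeName, dataId=dataId)
    assert isinstance(obj, datasetType.storageClass.pytype)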
1553@unittest.skipUnless(testing is not None, "testing.postgresql module not found")
1554class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1555 """PosixDatastore specialization of a butler using Postgres"""
1557 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1558 fullConfigKey = ".datastore.formatters"
1559 validationCanFail = True
1560 datastoreStr = ["/tmp"]
1561 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1562 registryStr = "PostgreSQL@test"
1563 postgresql: Any
1565 @staticmethod
1566 def _handler(postgresql: Any) -> None:
1567 engine = sqlalchemy.engine.create_engine(postgresql.url())
1568 with engine.begin() as connection:
1569 connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;"))
1571 @classmethod
1572 def setUpClass(cls) -> None:
1573 # Create the postgres test server.
1574 cls.postgresql = testing.postgresql.PostgresqlFactory(
1575 cache_initialized_db=True, on_initialized=cls._handler
1576 )
1577 super().setUpClass()
1579 @classmethod
1580 def tearDownClass(cls) -> None:
1581 # Clean up any lingering SQLAlchemy engines/connections
1582 # so they're closed before we shut down the server.
1583 gc.collect()
1584 cls.postgresql.clear_cache()
1585 super().tearDownClass()
1587 def setUp(self) -> None:
1588 self.server = self.postgresql()
1590 # Need to add a registry section to the config.
1591 self._temp_config = False
1592 config = Config(self.configFile)
1593 config["registry", "db"] = self.server.url()
1594 with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh:
1595 config.dump(fh)
1596 self.configFile = fh.name
1597 self._temp_config = True
1598 super().setUp()
1600 def tearDown(self) -> None:
1601 self.server.stop()
1602 if self._temp_config and os.path.exists(self.configFile):
1603 os.remove(self.configFile)
1604 super().tearDown()
1606 def testMakeRepo(self) -> None:
1607 # The base class test assumes that it is using sqlite and that
1608 # the config file is acceptable to sqlite.
1609 raise unittest.SkipTest("Postgres config is not compatible with this test.")
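# The testing.postgresql pattern used by this test case, in isolation
# (a sketch; the factory caches an initialised database between runs):
def _sketch_postgresql_factory() -> None:
    factory = testing.postgresql.PostgresqlFactory(cache_initialized_db=True)
    server = factory()  # start a throw-away postgres server
    database_url = server.url()  # e.g. passed to config["registry", "db"]
    server.stop()  # shut the server down again
    factory.clear_cache()  # drop the cached initialised database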
1612class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1613 """InMemoryDatastore specialization of a butler"""
1615 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1616 fullConfigKey = None
1617 useTempRoot = False
1618 validationCanFail = False
1619 datastoreStr = ["datastore='InMemory"]
1620 datastoreName = ["InMemoryDatastore@"]
1621 registryStr = "/gen3.sqlite3"
1623 def testIngest(self) -> None:
1624 """Skipped: file ingest is not supported by the in-memory datastore."""
1627class ChainedDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1628 """PosixDatastore specialization"""
1630 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
1631 fullConfigKey = ".datastore.datastores.1.formatters"
1632 validationCanFail = True
1633 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
1634 datastoreName = [
1635 "InMemoryDatastore@",
1636 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
1637 "SecondDatastore",
1638 ]
1639 registryStr = "/gen3.sqlite3"
1642class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
1643 """Test that a yaml file in one location can refer to a root in another."""
1645 datastoreStr = ["dir1"]
1646 # Disable the makeRepo test since we are deliberately not using
1647 # butler.yaml as the config name.
1648 fullConfigKey = None
1650 def setUp(self) -> None:
1651 self.root = makeTestTempDir(TESTDIR)
1653 # Make a new repository in one place
1654 self.dir1 = os.path.join(self.root, "dir1")
1655 Butler.makeRepo(self.dir1, config=Config(self.configFile))
1657 # Move the yaml file to a different place and add a "root"
1658 self.dir2 = os.path.join(self.root, "dir2")
1659 os.makedirs(self.dir2, exist_ok=True)
1660 configFile1 = os.path.join(self.dir1, "butler.yaml")
1661 config = Config(configFile1)
1662 config["root"] = self.dir1
1663 configFile2 = os.path.join(self.dir2, "butler2.yaml")
1664 config.dumpToUri(configFile2)
1665 os.remove(configFile1)
1666 self.tmpConfigFile = configFile2
1668 def testFileLocations(self) -> None:
1669 self.assertNotEqual(self.dir1, self.dir2)
1670 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
1671 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
1672 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
1675class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
1676 """Test that a config file created by makeRepo outside of repo works."""
1678 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1680 def setUp(self) -> None:
1681 self.root = makeTestTempDir(TESTDIR)
1682 self.root2 = makeTestTempDir(TESTDIR)
1684 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
1685 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1687 def tearDown(self) -> None:
1688 if os.path.exists(self.root2):
1689 shutil.rmtree(self.root2, ignore_errors=True)
1690 super().tearDown()
1692 def testConfigExistence(self) -> None:
1693 c = Config(self.tmpConfigFile)
1694 uri_config = ResourcePath(c["root"])
1695 uri_expected = ResourcePath(self.root, forceDirectory=True)
1696 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
1697 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
1699 def testPutGet(self) -> None:
1700 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1701 self.runPutGetTest(storageClass, "test_metric")
1704class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
1705 """Test that a config file created by makeRepo outside of repo works."""
1707 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1709 def setUp(self) -> None:
1710 self.root = makeTestTempDir(TESTDIR)
1711 self.root2 = makeTestTempDir(TESTDIR)
1713 self.tmpConfigFile = self.root2
1714 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1716 def testConfigExistence(self) -> None:
1717 # Append the yaml file name, else the Config constructor does not
1718 # know the file type.
1719 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
1720 super().testConfigExistence()
1723class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
1724 """Test that a config file created by makeRepo outside of repo works."""
1726 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1728 def setUp(self) -> None:
1729 self.root = makeTestTempDir(TESTDIR)
1730 self.root2 = makeTestTempDir(TESTDIR)
1732 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
1733 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1736 @unittest.skipIf(not boto3, "boto3 AWS SDK not found")
1737class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1738 """S3Datastore specialization of a butler; an S3 storage Datastore +
1739 a local in-memory SqlRegistry.
1740 """
1742 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
1743 fullConfigKey = None
1744 validationCanFail = True
1746 bucketName = "anybucketname"
1747 """Name of the Bucket that will be used in the tests. The name is read from
1748 the config file used with the tests during set-up.
1749 """
1751 root = "butlerRoot/"
1752 """Root repository directory expected to be used in case useTempRoot=False.
1753 Otherwise the root is set to a 20 characters long randomly generated string
1754 during set-up.
1755 """
1757 datastoreStr = [f"datastore={root}"]
1758 """Contains all expected root locations in a format expected to be
1759 returned by Butler stringification.
1760 """
1762 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
1763 """The expected format of the S3 Datastore string."""
1765 registryStr = "/gen3.sqlite3"
1766 """Expected format of the Registry string."""
1768 mock_s3 = mock_s3()
1769 """The mocked s3 interface from moto."""
1771 def genRoot(self) -> str:
1772 """Returns a random string of len 20 to serve as a root
1773 name for the temporary bucket repo.
1775 This is equivalent to tempfile.mkdtemp as this is what self.root
1776 becomes when useTempRoot is True.
1777 """
1778 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1779 return rndstr + "/"
1781 def setUp(self) -> None:
1782 config = Config(self.configFile)
1783 uri = ResourcePath(config[".datastore.datastore.root"])
1784 self.bucketName = uri.netloc
1786 # Enable S3 mocking of tests.
1787 self.mock_s3.start()
1790 # Set up some fake credentials if real ones do not exist.
1790 self.usingDummyCredentials = setAwsEnvCredentials()
1792 if self.useTempRoot:
1793 self.root = self.genRoot()
1794 rooturi = f"s3://{self.bucketName}/{self.root}"
1795 config.update({"datastore": {"datastore": {"root": rooturi}}})
1797 # Need a local folder to store the registry database.
1798 self.reg_dir = makeTestTempDir(TESTDIR)
1799 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1801 # Moto needs to know that we expect the bucket self.bucketName to
1802 # exist (this used to be a hard-coded class attribute).
1803 s3 = boto3.resource("s3")
1804 s3.create_bucket(Bucket=self.bucketName)
1806 self.datastoreStr = [f"datastore='{rooturi}'"]
1807 self.datastoreName = [f"FileDatastore@{rooturi}"]
1808 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
1809 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
1811 def tearDown(self) -> None:
1812 s3 = boto3.resource("s3")
1813 bucket = s3.Bucket(self.bucketName)
1814 try:
1815 bucket.objects.all().delete()
1816 except botocore.exceptions.ClientError as e:
1817 if e.response["Error"]["Code"] == "404":
1818 # The key was not reachable; nothing to delete.
1819 pass
1820 else:
1821 raise
1823 bucket = s3.Bucket(self.bucketName)
1824 bucket.delete()
1826 # Stop the S3 mock.
1827 self.mock_s3.stop()
1829 # Unset any dummy credentials that we may have set.
1830 if self.usingDummyCredentials:
1831 unsetAwsEnvCredentials()
1833 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1834 shutil.rmtree(self.reg_dir, ignore_errors=True)
1836 if self.useTempRoot and os.path.exists(self.root):
1837 shutil.rmtree(self.root, ignore_errors=True)
1839 super().tearDown()
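# The moto mocking pattern used by this test case, reduced to its
# essentials (a sketch; the bucket name is hypothetical):
def _sketch_moto_usage() -> None:
    mock = mock_s3()
    mock.start()
    usingDummyCredentials = setAwsEnvCredentials()
    s3 = boto3.resource("s3")
    s3.create_bucket(Bucket="example-bucket")  # hypothetical name
    # ... interact with the mocked bucket here ...
    mock.stop()
    if usingDummyCredentials:
        unsetAwsEnvCredentials()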
1842class PosixDatastoreTransfers(unittest.TestCase):
1843 """Test data transfers between butlers.
1845 Tests cover different dataset ID managers. UUID-to-UUID and
1846 integer-to-integer transfers are tested. UUID-to-integer is not
1847 supported since we do not currently want to allow it. Integer-to-UUID
1848 is supported, with the caveat that a UUID4 will be generated, which
1849 is incorrect for raw dataset types; the tests ignore that.
1850 """
1852 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1853 storageClassFactory: StorageClassFactory
1855 @classmethod
1856 def setUpClass(cls) -> None:
1857 cls.storageClassFactory = StorageClassFactory()
1858 cls.storageClassFactory.addFromConfig(cls.configFile)
1860 def setUp(self) -> None:
1861 self.root = makeTestTempDir(TESTDIR)
1862 self.config = Config(self.configFile)
1864 def tearDown(self) -> None:
1865 removeTestTempDir(self.root)
1867 def create_butler(self, manager: str, label: str) -> Butler:
1868 config = Config(self.configFile)
1869 config["registry", "managers", "datasets"] = manager
1870 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
1872 def create_butlers(self, manager1: str | None = None, manager2: str | None = None) -> None:
1873 default = "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID"
1874 if manager1 is None:
1875 manager1 = default
1876 if manager2 is None:
1877 manager2 = default
1878 self.source_butler = self.create_butler(manager1, "1")
1879 self.target_butler = self.create_butler(manager2, "2")
1881 def testTransferUuidToUuid(self) -> None:
1882 self.create_butlers()
1883 self.assertButlerTransfers()
1885 def _enable_trust(self, datastore: Datastore) -> None:
1886 if hasattr(datastore, "trustGetRequest"):
1887 datastore.trustGetRequest = True
1888 elif hasattr(datastore, "datastores"):
1889 for child in datastore.datastores:
1890 if hasattr(child, "trustGetRequest"):
1891 child.trustGetRequest = True
1893 def testTransferMissing(self) -> None:
1894 """Test transfers where datastore records are missing.
1896 This is how execution butler works.
1897 """
1898 self.create_butlers()
1900 # Configure the source butler to allow trust.
1901 self._enable_trust(self.source_butler.datastore)
1903 self.assertButlerTransfers(purge=True)
1905 def testTransferMissingDisassembly(self) -> None:
1906 """Test transfers where datastore records are missing.
1908 This is how execution butler works.
1909 """
1910 self.create_butlers()
1912 # Configure the source butler to allow trust.
1913 self._enable_trust(self.source_butler.datastore)
1915 # Test disassembly.
1916 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
1918 def testAbsoluteURITransferDirect(self) -> None:
1919 """Test transfer using an absolute URI."""
1920 self._absolute_transfer("auto")
1922 def testAbsoluteURITransferCopy(self) -> None:
1923 """Test transfer using an absolute URI."""
1924 self._absolute_transfer("copy")
1926 def _absolute_transfer(self, transfer: str) -> None:
1927 self.create_butlers()
1929 storageClassName = "StructuredData"
1930 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
1931 datasetTypeName = "random_data"
1932 run = "run1"
1933 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
1935 dimensions = self.source_butler.registry.dimensions.extract(())
1936 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
1937 self.source_butler.registry.registerDatasetType(datasetType)
1939 metrics = makeExampleMetrics()
1940 with ResourcePath.temporary_uri(suffix=".json") as temp:
1941 dataId = DataCoordinate.makeEmpty(self.source_butler.dimensions)
1942 source_refs = [DatasetRef(datasetType, dataId, run=run)]
1943 temp.write(json.dumps(metrics.exportAsDict()).encode())
1944 dataset = FileDataset(path=temp, refs=source_refs)
1945 self.source_butler.ingest(dataset, transfer="direct")
1947 self.target_butler.transfer_from(
1948 self.source_butler, dataset.refs, register_dataset_types=True, transfer=transfer
1949 )
1951 uri = self.target_butler.getURI(dataset.refs[0])
1952 if transfer == "auto":
1953 self.assertEqual(uri, temp)
1954 else:
1955 self.assertNotEqual(uri, temp)
1957 def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "StructuredData") -> None:
1958 """Test that a run can be transferred to another butler."""
1960 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
1961 datasetTypeName = "random_data"
1963 # The test will create 3 collections, of which we will want to
1964 # transfer two.
1965 runs = ["run1", "run2", "other"]
1967 # We also want to use two different dataset types to ensure that
1968 # grouping works.
1969 datasetTypeNames = ["random_data", "random_data_2"]
1971 # Create the run collections in the source butler.
1972 for run in runs:
1973 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
1975 # Create dimensions in source butler.
1976 n_exposures = 30
1977 self.source_butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1978 self.source_butler.registry.insertDimensionData(
1979 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
1980 )
1981 self.source_butler.registry.insertDimensionData(
1982 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
1983 )
1985 for i in range(n_exposures):
1986 self.source_butler.registry.insertDimensionData(
1987 "exposure",
1988 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
1989 )
1991 # Create dataset types in the source butler.
1992 dimensions = self.source_butler.registry.dimensions.extract(["instrument", "exposure"])
1993 for datasetTypeName in datasetTypeNames:
1994 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
1995 self.source_butler.registry.registerDatasetType(datasetType)
1997 # Write a dataset to an unrelated run -- this ensures that we are
1998 # rewriting integer dataset ids in the target if necessary. This is
1999 # not relevant for UUIDs.
2000 run = "distraction"
2001 butler = Butler(butler=self.source_butler, run=run)
2002 butler.put(
2003 makeExampleMetrics(),
2004 datasetTypeName,
2005 exposure=1,
2006 instrument="DummyCamComp",
2007 physical_filter="d-r",
2008 )
2010 # Write some example metrics to the source
2011 butler = Butler(butler=self.source_butler)
2013 # Set of DatasetRefs that should be in the list of refs to transfer
2014 # but which will not be transferred.
2015 deleted: set[DatasetRef] = set()
2017 n_expected = 20 # Number of datasets expected to be transferred
2018 source_refs = []
2019 for i in range(n_exposures):
2020 # Put a third of the datasets into each collection; only retain
2021 # two thirds.
2022 index = i % 3
2023 run = runs[index]
2024 datasetTypeName = datasetTypeNames[i % 2]
2026 metric = MetricsExample(
2027 summary={"counter": i}, output={"text": "metric"}, data=[2 * x for x in range(i)]
2028 )
2029 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
2030 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
2032 # Remove the datastore record using the low-level API.
2033 if purge:
2034 # Remove records for a fraction.
2035 if index == 1:
2036 # For one of these delete the file as well.
2037 # This allows the "missing" code to filter the
2038 # file out.
2039 # Access the individual datastores.
2040 datastores = []
2041 if hasattr(butler.datastore, "datastores"):
2042 datastores.extend(butler.datastore.datastores)
2043 else:
2044 datastores.append(butler.datastore)
2046 if not deleted:
2047 # For a chained datastore we need to remove
2048 # files in each chain.
2049 for datastore in datastores:
2050 # The file might not be known to the datastore
2051 # if constraints are used.
2052 try:
2053 primary, uris = datastore.getURIs(ref)
2054 except FileNotFoundError:
2055 continue
2056 if primary:
2057 if primary.scheme != "mem":
2058 primary.remove()
2059 for uri in uris.values():
2060 if uri.scheme != "mem":
2061 uri.remove()
2062 n_expected -= 1
2063 deleted.add(ref)
2065 # Remove the datastore record.
2066 for datastore in datastores:
2067 if hasattr(datastore, "removeStoredItemInfo"):
2068 datastore.removeStoredItemInfo(ref)
2070 if index < 2:
2071 source_refs.append(ref)
2072 if ref not in deleted:
2073 new_metric = butler.get(ref)
2074 self.assertEqual(new_metric, metric)
2076 # Create some bad dataset types to ensure we check for inconsistent
2077 # definitions.
2078 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
2079 for datasetTypeName in datasetTypeNames:
2080 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
2081 self.target_butler.registry.registerDatasetType(datasetType)
2082 with self.assertRaises(ConflictingDefinitionError) as cm:
2083 self.target_butler.transfer_from(self.source_butler, source_refs)
2084 self.assertIn("dataset type differs", str(cm.exception))
2086 # And remove the bad definitions.
2087 for datasetTypeName in datasetTypeNames:
2088 self.target_butler.registry.removeDatasetType(datasetTypeName)
2090 # Transfer without creating dataset types should fail.
2091 with self.assertRaises(KeyError):
2092 self.target_butler.transfer_from(self.source_butler, source_refs)
2094 # Transfer without creating dimensions should fail.
2095 with self.assertRaises(ConflictingDefinitionError) as cm:
2096 self.target_butler.transfer_from(self.source_butler, source_refs, register_dataset_types=True)
2097 self.assertIn("dimension", str(cm.exception))
2099 # The failed transfer above leaves registry in an inconsistent
2100 # state because the run is created but then rolled back without
2101 # the collection cache being cleared. For now force a refresh.
2102 # Can remove with DM-35498.
2103 self.target_butler.registry.refresh()
2105 # Now transfer them to the second butler, including dimensions.
2106 with self.assertLogs(level=logging.DEBUG) as log_cm:
2107 transferred = self.target_butler.transfer_from(
2108 self.source_butler,
2109 source_refs,
2110 register_dataset_types=True,
2111 transfer_dimensions=True,
2112 )
2113 self.assertEqual(len(transferred), n_expected)
2114 log_output = ";".join(log_cm.output)
2116 # A ChainedDatastore will use the in-memory datastore for mexists,
2117 # so we cannot rely on the mexists log message.
2118 self.assertIn("Number of datastore records found in source", log_output)
2119 self.assertIn("Creating output run", log_output)
2121 # Do the transfer twice to ensure that it will do nothing extra.
2122 # Only do this if purge=True because it does not work for int
2123 # dataset_id.
2124 if purge:
2125 # This should not need to register dataset types.
2126 transferred = self.target_butler.transfer_from(self.source_butler, source_refs)
2127 self.assertEqual(len(transferred), n_expected)
2129 # Also do an explicit low-level transfer to trigger some
2130 # edge cases.
2131 with self.assertLogs(level=logging.DEBUG) as log_cm:
2132 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
2133 log_output = ";".join(log_cm.output)
2134 self.assertIn("no file artifacts exist", log_output)
2136 with self.assertRaises((TypeError, AttributeError)):
2137 self.target_butler.datastore.transfer_from(self.source_butler, source_refs) # type: ignore
2139 with self.assertRaises(ValueError):
2140 self.target_butler.datastore.transfer_from(
2141 self.source_butler.datastore, source_refs, transfer="split"
2142 )
2144 # Now try to get the same refs from the new butler.
2145 for ref in source_refs:
2146 if ref not in deleted:
2147 new_metric = self.target_butler.get(ref)
2148 old_metric = self.source_butler.get(ref)
2149 self.assertEqual(new_metric, old_metric)
2151 # Now prune run2 collection and create instead a CHAINED collection.
2152 # This should block the transfer.
2153 self.target_butler.removeRuns(["run2"], unstore=True)
2154 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2155 with self.assertRaises(CollectionTypeError):
2156 # Re-importing the run1 datasets can be problematic if they
2157 # use integer IDs, so filter those out.
2158 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2159 self.target_butler.transfer_from(self.source_butler, to_transfer)
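# A condensed sketch of the high-level transfer API exercised by
# assertButlerTransfers(), assuming existing source and target butlers:
def _sketch_transfer_from(source: Butler, target: Butler, refs: list[DatasetRef]) -> None:
    transferred = target.transfer_from(
        source,
        refs,
        transfer="auto",  # or "copy"; "auto" keeps absolute URIs direct
        register_dataset_types=True,  # create missing dataset types
        transfer_dimensions=True,  # copy required dimension records too
    )
    assert len(transferred) <= len(refs)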
2162class ChainedDatastoreTransfers(PosixDatastoreTransfers):
2163 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
2166if __name__ == "__main__":
2167 unittest.main()