# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""Tests for Butler.
23"""
24from __future__ import annotations
26import gc
27import json
28import logging
29import os
30import pathlib
31import pickle
32import posixpath
33import random
34import shutil
35import string
36import tempfile
37import unittest
38import uuid
39from collections.abc import Mapping
40from typing import TYPE_CHECKING, Any, cast

try:
    import boto3
    import botocore
    from moto import mock_s3  # type: ignore[import]
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto mock_s3 can not be imported."""
        return cls
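
# With moto installed, ``mock_s3`` wraps a test class so that boto3 S3 calls
# are intercepted; the no-op fallback above keeps that decorator syntax valid
# when moto is absent. A sketch of the intended use (class name illustrative):
#
#     @mock_s3
#     class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
#         """Butler tests that run against a mocked S3 bucket."""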


try:
    # It's possible but silly to have testing.postgresql installed without
    # having the postgresql server installed (because then nothing in
    # testing.postgresql would work), so we use the presence of that module
    # to test whether we can expect the server to be available.
    import testing.postgresql  # type: ignore[import]
except ImportError:
    testing = None

import astropy.time
import sqlalchemy
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    CollectionType,
    Config,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplate,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.datastores.fileDatastore import FileDatastore
from lsst.daf.butler.registries.sql import SqlRegistry
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
    OrphanedRecordError,
)
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImportType
from lsst.utils.introspection import get_full_type_name

if TYPE_CHECKING:
    from lsst.daf.butler import Datastore, DimensionGraph, Registry, StorageClass

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )
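
# The three positional arguments above become the ``summary``, ``output`` and
# ``data`` attributes of the MetricsExample instance; the put/get tests below
# compare against those attributes directly, and the ``slice`` read parameter
# used later operates on ``data``.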


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent misdiagnosing
    that might otherwise occur when a standard exception is used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not tested in any other test
    cases."""

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests(TestCaseMixin):
    """Helper methods for running a suite of put/get tests from different
    butler configurations."""

    root: str
    default_run = "ingésτ😺"
    storageClassFactory: StorageClassFactory
    configFile: str
    tmpConfigFile: str

    @staticmethod
    def addDatasetType(
        datasetTypeName: str, dimensions: DimensionGraph, storageClass: StorageClass | str, registry: Registry
    ) -> DatasetType:
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls) -> None:
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None) -> None:
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self) -> None:
        removeTestTempDir(self.root)

    def create_butler(
        self, run: str, storageClass: StorageClass | str, datasetTypeName: str
    ) -> tuple[Butler, DatasetType]:
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "visit_system": 1,
                },
            )
        return butler, datasetType
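
    # The three equivalent ways of addressing a dataset that runPutGetTest
    # exercises below, in sketch form (not executed here; names as created
    # by create_butler):
    #
    #     butler.put(metric, ref)                      # resolved DatasetRef
    #     butler.put(metric, datasetTypeName, dataId)  # type name + data ID
    #     butler.put(metric, datasetType, dataId)      # DatasetType + data ID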

    def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Butler:
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = self.default_run
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)
        assert butler.run is not None

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = butler.registry.expandDataId({"instrument": "DummyCamComp", "visit": 423})

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        ref = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run="put_run_1")
        args: tuple[DatasetRef] | tuple[str | DatasetType, DataCoordinate]
        for args in ((ref,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test get with a resolved DatasetRef
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

                # Can the artifacts themselves be retrieved?
                if not butler.datastore.isEphemeral:
                    root_uri = ResourcePath(self.root)

                    for preserve_path in (True, False):
                        destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                        # Use copy so that we can test that overwrite
                        # protection works (using "auto" for File URIs would
                        # use hard links and subsequent transfer would work
                        # because it knows they are the same file).
                        transferred = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, transfer="copy"
                        )
                        self.assertGreater(len(transferred), 0)
                        artifacts = list(ResourcePath.findFileResources([destination]))
                        self.assertEqual(set(transferred), set(artifacts))

                        for artifact in transferred:
                            path_in_destination = artifact.relative_to(destination)
                            self.assertIsNotNone(path_in_destination)
                            assert path_in_destination is not None

                            # when path is not preserved there should not be
                            # any path separators.
                            num_seps = path_in_destination.count("/")
                            if preserve_path:
                                self.assertGreater(num_seps, 0)
                            else:
                                self.assertEqual(num_seps, 0)

                        primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                        n_uris = len(secondary_uris)
                        if primary_uri:
                            n_uris += 1
                        self.assertEqual(
                            len(artifacts),
                            n_uris,
                            "Comparing expected artifacts vs actual:"
                            f" {artifacts} vs {primary_uri} and {secondary_uris}",
                        )

                        if preserve_path:
                            # No need to run these twice
                            with self.assertRaises(ValueError):
                                butler.retrieveArtifacts([ref], destination, transfer="move")

                            with self.assertRaises(FileExistsError):
                                butler.retrieveArtifacts([ref], destination)

                            transferred_again = butler.retrieveArtifacts(
                                [ref], destination, preserve_path=preserve_path, overwrite=True
                            )
                            self.assertEqual(set(transferred_again), set(transferred))

                # Now remove the dataset completely.
                butler.pruneDatasets([ref], purge=True, unstore=True)
                # Lookup with original args should still fail.
                with self.assertRaises(LookupError):
                    butler.datasetExists(*args, collections=this_run)
                # get() should still fail.
                with self.assertRaises(FileNotFoundError):
                    butler.get(ref)
                # Registry shouldn't be able to find it by dataset_id anymore.
                self.assertIsNone(butler.registry.getDataset(ref.id))

                # Do explicit registry removal since we know they are
                # empty
                butler.registry.removeCollection(this_run)
                expected_collections.remove(this_run)

        # Create DatasetRef for put using default run.
        refIn = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run=butler.run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            assert compRef is not None
            summary = butler.get(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaisesRegex(ValueError, "Supplied dataset type .* inconsistent with registry"):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaisesRegex(ValueError, "DatasetRef given, cannot use dataId as well"):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match.
        with self.assertRaises(FileNotFoundError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=uuid.UUID(int=101), run=butler.run))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaisesRegex(KeyError, "Parameter 'unsupported' not understood"):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Add the same ref again, so we can check that duplicate put fails.
        ref = butler.put(metric, datasetType, dataId)

        # Repeat put will fail.
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Repeat the same sequence with resolved ref.
        butler.pruneDatasets([ref], unstore=True, purge=True)
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaisesRegex(ConflictingDefinitionError, "Datastore already contains dataset"):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # In case of resolved ref this write will succeed.
        ref = butler.put(metric, refIn)

        # Leave the dataset in place since some downstream tests require
        # something to be present
        return butler

    def testDeferredCollectionPassing(self) -> None:
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run collection defined should fail with
        # CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection raises
        # CollectionError.
        with self.assertRaises(CollectionError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Deleting the dataset from the new collection should make it findable
        # in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True
    validationCanFail: bool
    fullConfigKey: str | None
    registryStr: str | None
    datastoreName: list[str] | None
    datastoreStr: list[str]

    def setUp(self) -> None:
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self) -> None:
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run=self.default_run)
            self.assertIsInstance(butler, Butler)

            # Even with a ResourcePath.
            butler = Butler(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {self.default_run})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, ("other",))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler("not_there", writeable=False)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertEqual(Butler.get_repo_uri("missing", True), ResourcePath("missing"))
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertEqual(Butler.get_repo_uri("label", True), ResourcePath("label"))
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases"):
            # No aliases registered.
            Butler("not_there")
        self.assertEqual(Butler.get_known_repos(), set())
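
    # For reference, the repository index exercised above is a flat mapping
    # from label to butler config URI; in its YAML form it looks like:
    #
    #     label: /path/to/repo/butler.yaml
    #     bad_label: s3://bucket/not_real.yaml
    #
    # with the DAF_BUTLER_REPOSITORY_INDEX environment variable pointing at
    # that file.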

    def testBasicPutGet(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testStorageClassOverrideGet(self) -> None:
        """Test storage class conversion on get with override."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        datasetTypeName = "anything"
        run = self.default_run

        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset.
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        ref = butler.put(metric, datasetType, dataId)

        # Return native type.
        retrieved = butler.get(ref)
        self.assertEqual(retrieved, metric)

        # Specify an override.
        new_sc = self.storageClassFactory.getStorageClass("MetricsConversion")
        model = butler.get(ref, storageClass=new_sc)
        self.assertNotEqual(type(model), type(retrieved))
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override later.
        deferred = butler.getDeferred(ref)
        model = deferred.get(storageClass=new_sc)
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override up front.
        deferred = butler.getDeferred(ref, storageClass=new_sc)
        model = deferred.get()
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Retrieve a component. Should be a tuple.
        data = butler.get("anything.data", dataId, storageClass="StructuredDataDataTestTuple")
        self.assertIs(type(data), tuple)
        self.assertEqual(data, tuple(retrieved.data))

        # Parameter on the write storage class should work regardless
        # of read storage class.
        data = butler.get(
            "anything.data",
            dataId,
            storageClass="StructuredDataDataTestTuple",
            parameters={"slice": slice(2, 4)},
        )
        self.assertEqual(len(data), 2)

        # Try a parameter that is known to the read storage class but not
        # the write storage class.
        with self.assertRaises(KeyError):
            butler.get(
                "anything.data",
                dataId,
                storageClass="StructuredDataDataTestTuple",
                parameters={"xslice": slice(2, 4)},
            )

    def testPytypePutCoercion(self) -> None:
        """Test python type coercion on Butler.get and put."""

        # Store some data with the normal example storage class.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        datasetTypeName = "test_metric"
        butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)

        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Put a dict and this should coerce to a MetricsExample
        test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
        metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
        test_metric = butler.get(metric_ref)
        self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
        self.assertEqual(test_metric.summary, test_dict["summary"])
        self.assertEqual(test_metric.output, test_dict["output"])

        # Check that the put still works if a DatasetType is given with
        # a definition matching this python type.
        registry_type = butler.registry.getDatasetType(datasetTypeName)
        this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
        metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
        self.assertEqual(metric2_ref.datasetType, registry_type)

        # The get will return the type expected by registry.
        test_metric2 = butler.get(metric2_ref)
        self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")

        # Make a new DatasetRef with the compatible but different DatasetType.
        # This should now return a dict.
        new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
        test_dict2 = butler.get(new_ref)
        self.assertEqual(get_full_type_name(test_dict2), "dict")

        # Get it again with the wrong dataset type definition, this time using
        # a dataId-based get() rather than a ref-based get(). This should be
        # consistent with the ref-based behavior and return the type of the
        # DatasetType.
        test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
        self.assertEqual(get_full_type_name(test_dict3), "dict")
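
    # In summary: on put, the in-memory python type is coerced to the storage
    # class registered for the dataset type; on get, the DatasetType carried
    # by the ref (or supplied explicitly) determines the python type returned.
    # That is why the same stored dataset comes back above as either a
    # MetricsExample or a plain dict.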

    def testIngest(self) -> None:
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImportType("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, run=self.default_run)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, run=self.default_run))

        # Test "move" transfer to ensure that the files themselves
        # have disappeared following ingest.
        with ResourcePath.temporary_uri(suffix=".yaml") as tempFile:
            tempFile.transfer_from(ResourcePath(metricFile), transfer="copy")

            datasets = []
            datasets.append(FileDataset(path=tempFile, refs=refs, formatter=MultiDetectorFormatter))

            # For first ingest use copy.
            butler.ingest(*datasets, transfer="copy", record_validation_info=False)

            # Now try to ingest again in "execution butler" mode where
            # the registry entries exist but the datastore does not have
            # the files. We also need to strip the dimension records to ensure
            # that they will be re-added by the ingest.
            ref = datasets[0].refs[0]
            datasets[0].refs = [
                cast(
                    DatasetRef,
                    butler.registry.findDataset(ref.datasetType, dataId=ref.dataId, collections=ref.run),
                )
                for ref in datasets[0].refs
            ]
            all_refs = []
            for dataset in datasets:
                refs = []
                for ref in dataset.refs:
                    # Create a dict from the dataId to drop the records.
                    new_data_id = {str(k): v for k, v in ref.dataId.items()}
                    new_ref = butler.registry.findDataset(ref.datasetType, new_data_id, collections=ref.run)
                    assert new_ref is not None
                    self.assertFalse(new_ref.dataId.hasRecords())
                    refs.append(new_ref)
                dataset.refs = refs
                all_refs.extend(dataset.refs)
            butler.pruneDatasets(all_refs, disassociate=False, unstore=True, purge=False)

            # Use move mode to test that the file is deleted. Also
            # disable recording of file size.
            butler.ingest(*datasets, transfer="move", record_validation_info=False)

            # Check that every ref now has records.
            for dataset in datasets:
                for ref in dataset.refs:
                    self.assertTrue(ref.dataId.hasRecords())

            # Ensure that the file has disappeared.
            self.assertFalse(tempFile.exists())

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])  # type: ignore[attr-defined]
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

        # Ensure we can ingest 0 datasets
        datasets = []
        butler.ingest(*datasets)
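
    # The single-file, multi-dataset ingest above hinges on one FileDataset
    # carrying several refs, in sketch form (refs as built in the test):
    #
    #     FileDataset(path=tempFile, refs=[ref_det1, ref_det2],
    #                 formatter=MultiDetectorFormatter)
    #
    # where the formatter is responsible for extracting each detector's slice
    # of the shared file on read.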

    def testPickle(self) -> None:
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self) -> None:
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries: list[tuple[str, list[Mapping[str, Any]]]] = [
            (
                "instrument",
                [
                    {"instrument": "DummyCam"},
                    {"instrument": "DummyHSC"},
                    {"instrument": "DummyCamComp"},
                ],
            ),
            ("physical_filter", [{"instrument": "DummyCam", "name": "d-r", "band": "R"}]),
            ("visit", [{"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}]),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for element, data in dimensionEntries:
            butler.registry.insertDimensionData(element, *data)

        # When a DatasetType is added to the registry, entries are not
        # created for components, but querying can return the components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry: set[DatasetType] = set()
        for parent_dataset_type in butler.registry.queryDatasetTypes():
            fromRegistry.add(parent_dataset_type)
            fromRegistry.update(parent_dataset_type.makeAllComponentDatasetTypes())
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self) -> None:
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries: tuple[tuple[str, Mapping[str, Any]], ...] = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test get with a resolved DatasetRef
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.get(ref)
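
    # The rollback behavior verified above, reduced to a sketch: any exception
    # escaping the context manager undoes the dimension inserts and the put.
    #
    #     with butler.transaction():
    #         butler.registry.insertDimensionData(...)
    #         butler.put(metric, datasetTypeName, dataId)
    #         raise TransactionTestError(...)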

    def testMakeRepo(self) -> None:
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self) -> None:
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self) -> None:
        """Test that dataIds can be rewritten based on dimension records."""

        butler = Butler(self.tmpConfigFile, run=self.default_run)

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)
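
    # The rewrite verified above, in sketch form: a put keyed on alternate
    # exposure metadata such as
    #
    #     {"seq_num": 3, "day_obs": 20210530,
    #      "instrument": "DummyCamComp", "physical_filter": "d-r"}
    #
    # is resolved through the exposure dimension records into the required
    # {"instrument": "DummyCamComp", "exposure": 3} before the dataset is
    # stored.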


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root: str | ResourcePath, relpath: str | ResourcePath) -> bool:
        """Check if a file exists at a given path (relative to root).

        Test testPutTemplates verifies actual physical existence of the files
        in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(uri.exists())
        self.assertTrue(
            uri.unquoted_path.endswith(f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle")
        )

        # Check the template based on dimensions
        if hasattr(butler.datastore, "templates"):
            butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(uri.exists())
        self.assertTrue(
            uri.unquoted_path.endswith(f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle")
        )

        # Check the template based on dimensions
        if hasattr(butler.datastore, "templates"):
            butler.datastore.templates.validateTemplates([ref])

        # Use a template that has a typo in dimension record metadata.
        # Easier to test with a butler that has a ref with records attached.
        template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits")
        with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"):
            path = template.format(ref)
        self.assertEqual(path, f"a/v423/{ref.id}_fits")

        template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits")
        with self.assertRaises(KeyError):
            with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"):
                template.format(ref)

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)
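
    # FileTemplate placeholders used above: ``{visit.name}`` pulls a field
    # from an attached dimension record, ``{id}`` is the dataset ID, and a
    # trailing ``:?`` (as in ``{visit.namex:?}``) marks a field as optional,
    # so an unknown record attribute is dropped with a log message rather
    # than raising KeyError.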

    def testImportExport(self) -> None:
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self) -> None:
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass: StorageClass) -> None:
        """This test does an export to a temp directory and an import back
        into a new temp directory repo. It does not assume a posix datastore.
        """
        exportButler = self.runPutGetTest(storageClass, "test_metric")

        # Test that we must have a file extension.
        with self.assertRaises(ValueError):
            with exportButler.export(filename="dump", directory=".") as export:
                pass

        # Test that unknown format is not allowed.
        with self.assertRaises(ValueError):
            with exportButler.export(filename="dump.fits", directory=".") as export:
                pass

        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements even
                # though there aren't any in these datasets or in the database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public api.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                    )
                importButler = Butler(importDir, run=self.default_run)
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )

    def testRemoveRuns(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put a dataset in each.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])

        with self.assertRaises(OrphanedRecordError):
            butler.registry.removeDatasetType(datasetType.name)

        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())

        # Now that the collections have been pruned we can remove the
        # dataset type
        butler.registry.removeDatasetType(datasetType.name)

        with self.assertLogs("lsst.daf.butler.registries", "INFO") as cm:
            butler.registry.removeDatasetType(tuple(["test*", "test*"]))
        self.assertIn("not defined", "\n".join(cm.output))


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler"""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey: str | None = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self) -> None:
        """Independent test of constructor using PathLike."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)
1403 def testExportTransferCopy(self) -> None:
1404 """Test local export using all transfer modes"""
1405 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1406 exportButler = self.runPutGetTest(storageClass, "test_metric")
1407 # Test that the repo actually has at least one dataset.
1408 datasets = list(exportButler.registry.queryDatasets(..., collections=...))
1409 self.assertGreater(len(datasets), 0)
1410 uris = [exportButler.getURI(d) for d in datasets]
1411 assert isinstance(exportButler.datastore, FileDatastore)
1412 datastoreRoot = exportButler.datastore.root
1414 pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]
1416 for path in pathsInStore:
1417 # Assume local file system
1418 assert path is not None
1419 self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")
1421 for transfer in ("copy", "link", "symlink", "relsymlink"):
1422 with safeTestTempDir(TESTDIR) as exportDir:
1423 with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
1424 export.saveDatasets(datasets)
1425 for path in pathsInStore:
1426 assert path is not None
1427 self.assertTrue(
1428 self.checkFileExists(exportDir, path),
1429 f"Check that mode {transfer} exported files",
1430 )
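# A hedged sketch of the export API exercised above; the helper name and its
# arguments are hypothetical.
def _sketch_export(butler: Butler, datasets: list[DatasetRef], export_dir: str) -> None:
    """Export dataset files plus registry metadata to a directory by copy."""
    with butler.export(directory=export_dir, format="yaml", transfer="copy") as export:
        export.saveDatasets(datasets)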
1432 def testPruneDatasets(self) -> None:
1433 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1434 butler = Butler(self.tmpConfigFile, writeable=True)
1435 assert isinstance(butler.datastore, FileDatastore)
1436 # Load registry data with dimensions to hang datasets off of.
1437 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
1438 butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
1439 # Add some RUN-type collections.
1440 run1 = "run1"
1441 butler.registry.registerRun(run1)
1442 run2 = "run2"
1443 butler.registry.registerRun(run2)
1444 # Put some datasets. ref1 and ref2 have the same data ID and are in
1445 # different runs. ref3 has a different data ID.
1446 metric = makeExampleMetrics()
1447 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
1448 datasetType = self.addDatasetType(
1449 "prune_collections_test_dataset", dimensions, storageClass, butler.registry
1450 )
1451 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
1452 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
1453 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
1455 # Simple prune.
1456 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1457 with self.assertRaises(LookupError):
1458 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)
1460 # Put data back.
1461 ref1 = butler.put(metric, ref1, run=run1)
1462 ref2 = butler.put(metric, ref2, run=run2)
1463 ref3 = butler.put(metric, ref3, run=run1)
1465 # Check that in normal mode, deleting the record will lead to
1466 # trash not touching the file.
1467 uri1 = butler.datastore.getURI(ref1)
1468 butler.datastore.bridge.moveToTrash([ref1], transaction=None) # Update the dataset_location table
1469 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
1470 butler.datastore.trash(ref1)
1471 butler.datastore.emptyTrash()
1472 self.assertTrue(uri1.exists())
1473 uri1.remove() # Clean it up.
1475 # Simulate execution butler setup by deleting the datastore
1476 # record but keeping the file around and trusting.
1477 butler.datastore.trustGetRequest = True
1478 uri2 = butler.datastore.getURI(ref2)
1479 uri3 = butler.datastore.getURI(ref3)
1480 self.assertTrue(uri2.exists())
1481 self.assertTrue(uri3.exists())
1483 # Remove the datastore record.
1484 butler.datastore.bridge.moveToTrash([ref2], transaction=None) # Update the dataset_location table
1485 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
1486 self.assertTrue(uri2.exists())
1487 butler.datastore.trash([ref2, ref3])
1488 # Immediate removal of the ref2 file.
1489 self.assertFalse(uri2.exists())
1490 # But ref3 has to wait for the trash to be emptied.
1491 self.assertTrue(uri3.exists())
1492 butler.datastore.emptyTrash()
1493 self.assertFalse(uri3.exists())
1495 # Clear out the datasets from the registry.
1496 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
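# Sketch (hypothetical helper) of the two-phase deletion pattern exercised
# above: trash() is reversible bookkeeping and only emptyTrash() removes the
# artifacts, except that in trust mode a ref whose datastore record is
# already gone has its file deleted immediately when trashed.
def _sketch_two_phase_delete(datastore: FileDatastore, refs: list[DatasetRef]) -> None:
    """Trash refs, then permanently remove their file artifacts."""
    datastore.trash(refs)
    datastore.emptyTrash()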
1498 def testPytypeCoercion(self) -> None:
1499 """Test python type coercion on Butler.get and put."""
1501 # Store some data with the normal example storage class.
1502 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1503 datasetTypeName = "test_metric"
1504 butler = self.runPutGetTest(storageClass, datasetTypeName)
1506 dataId = {"instrument": "DummyCamComp", "visit": 423}
1507 metric = butler.get(datasetTypeName, dataId=dataId)
1508 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample")
1510 datasetType_ori = butler.registry.getDatasetType(datasetTypeName)
1511 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents")
1513 # Now need to hack the registry dataset type definition.
1514 # There is no API for this.
1515 assert isinstance(butler.registry, SqlRegistry)
1516 manager = butler.registry._managers.datasets
1517 assert hasattr(manager, "_db") and hasattr(manager, "_static")
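# Note (an assumption about the private Database.update API): the ``where``
# dict maps column names to the keys in the row dicts that hold the values
# to match, which is why the row below repeats the dataset type name as a
# dict key.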
1518 manager._db.update(
1519 manager._static.dataset_type,
1520 {"name": datasetTypeName},
1521 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"},
1522 )
1524 # Force reset of dataset type cache
1525 butler.registry.refresh()
1527 datasetType_new = butler.registry.getDatasetType(datasetTypeName)
1528 self.assertEqual(datasetType_new.name, datasetType_ori.name)
1529 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel")
1531 metric_model = butler.get(datasetTypeName, dataId=dataId)
1532 self.assertNotEqual(type(metric_model), type(metric))
1533 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel")
1535 # Put the model and read it back to show that everything now
1536 # works as normal.
1537 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424)
1538 metric_model_new = butler.get(metric_ref)
1539 self.assertEqual(metric_model_new, metric_model)
1541 # Hack the storage class again to something that will make the get
1542 # fail because there is no conversion class.
1543 manager._db.update(
1544 manager._static.dataset_type,
1545 {"name": datasetTypeName},
1546 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"},
1547 )
1548 butler.registry.refresh()
1550 with self.assertRaises(ValueError):
1551 butler.get(datasetTypeName, dataId=dataId)
1554@unittest.skipUnless(testing is not None, "testing.postgresql module not found")
1555class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1556 """PosixDatastore specialization of a butler using Postgres"""
1558 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1559 fullConfigKey = ".datastore.formatters"
1560 validationCanFail = True
1561 datastoreStr = ["/tmp"]
1562 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
1563 registryStr = "PostgreSQL@test"
1564 postgresql: Any
1566 @staticmethod
1567 def _handler(postgresql: Any) -> None:
1568 engine = sqlalchemy.engine.create_engine(postgresql.url())
1569 with engine.begin() as connection:
1570 connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;"))
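# The btree_gist extension is created here because (as an assumption about
# the registry schema) daf_butler uses GiST exclusion constraints that mix
# scalar equality columns with timespan ranges.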
1572 @classmethod
1573 def setUpClass(cls) -> None:
1574 # Create the postgres test server.
1575 cls.postgresql = testing.postgresql.PostgresqlFactory(
1576 cache_initialized_db=True, on_initialized=cls._handler
1577 )
1578 super().setUpClass()
1580 @classmethod
1581 def tearDownClass(cls) -> None:
1582 # Clean up any lingering SQLAlchemy engines/connections
1583 # so they're closed before we shut down the server.
1584 gc.collect()
1585 cls.postgresql.clear_cache()
1586 super().tearDownClass()
1588 def setUp(self) -> None:
1589 self.server = self.postgresql()
1591 # Need to add a registry section to the config.
1592 self._temp_config = False
1593 config = Config(self.configFile)
1594 config["registry", "db"] = self.server.url()
1595 with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh:
1596 config.dump(fh)
1597 self.configFile = fh.name
1598 self._temp_config = True
1599 super().setUp()
1601 def tearDown(self) -> None:
1602 self.server.stop()
1603 if self._temp_config and os.path.exists(self.configFile):
1604 os.remove(self.configFile)
1605 super().tearDown()
1607 def testMakeRepo(self) -> None:
1608 # The base class test assumes that it's using sqlite and assumes
1609 # the config file is acceptable to sqlite.
1610 raise unittest.SkipTest("Postgres config is not compatible with this test.")
1613class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
1614 """InMemoryDatastore specialization of a butler"""
1616 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
1617 fullConfigKey = None
1618 useTempRoot = False
1619 validationCanFail = False
1620 datastoreStr = ["datastore='InMemory"]
1621 datastoreName = ["InMemoryDatastore@"]
1622 registryStr = "/gen3.sqlite3"
1624 def testIngest(self) -> None:
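# No-op override: ingesting external files does not apply to an in-memory
# datastore.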
1625 pass
1628class ChainedDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1629 """PosixDatastore specialization"""
1631 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
1632 fullConfigKey = ".datastore.datastores.1.formatters"
1633 validationCanFail = True
1634 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"]
1635 datastoreName = [
1636 "InMemoryDatastore@",
1637 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1",
1638 "SecondDatastore",
1639 ]
1640 registryStr = "/gen3.sqlite3"
1643class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
1644 """Test that a yaml file in one location can refer to a root in another."""
1646 datastoreStr = ["dir1"]
1647 # Disable the makeRepo test since we are deliberately not using
1648 # butler.yaml as the config name.
1649 fullConfigKey = None
1651 def setUp(self) -> None:
1652 self.root = makeTestTempDir(TESTDIR)
1654 # Make a new repository in one place
1655 self.dir1 = os.path.join(self.root, "dir1")
1656 Butler.makeRepo(self.dir1, config=Config(self.configFile))
1658 # Move the yaml file to a different place and add a "root"
1659 self.dir2 = os.path.join(self.root, "dir2")
1660 os.makedirs(self.dir2, exist_ok=True)
1661 configFile1 = os.path.join(self.dir1, "butler.yaml")
1662 config = Config(configFile1)
1663 config["root"] = self.dir1
1664 configFile2 = os.path.join(self.dir2, "butler2.yaml")
1665 config.dumpToUri(configFile2)
1666 os.remove(configFile1)
1667 self.tmpConfigFile = configFile2
1669 def testFileLocations(self) -> None:
1670 self.assertNotEqual(self.dir1, self.dir2)
1671 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
1672 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
1673 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))
1676class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
1677 """Test that a config file created by makeRepo outside of repo works."""
1679 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1681 def setUp(self) -> None:
1682 self.root = makeTestTempDir(TESTDIR)
1683 self.root2 = makeTestTempDir(TESTDIR)
1685 self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
1686 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1688 def tearDown(self) -> None:
1689 if os.path.exists(self.root2):
1690 shutil.rmtree(self.root2, ignore_errors=True)
1691 super().tearDown()
1693 def testConfigExistence(self) -> None:
1694 c = Config(self.tmpConfigFile)
1695 uri_config = ResourcePath(c["root"])
1696 uri_expected = ResourcePath(self.root, forceDirectory=True)
1697 self.assertEqual(uri_config.geturl(), uri_expected.geturl())
1698 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")
1700 def testPutGet(self) -> None:
1701 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
1702 self.runPutGetTest(storageClass, "test_metric")
1705class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
1706 """Test that a config file created by makeRepo outside of repo works."""
1708 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1710 def setUp(self) -> None:
1711 self.root = makeTestTempDir(TESTDIR)
1712 self.root2 = makeTestTempDir(TESTDIR)
1714 self.tmpConfigFile = self.root2
1715 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1717 def testConfigExistence(self) -> None:
1718 # Append the yaml file name, since otherwise the Config constructor
1719 # cannot determine the file type.
1720 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
1721 super().testConfigExistence()
1724class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
1725 """Test that a config file created by makeRepo outside of repo works."""
1727 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1729 def setUp(self) -> None:
1730 self.root = makeTestTempDir(TESTDIR)
1731 self.root2 = makeTestTempDir(TESTDIR)
1733 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl()
1734 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile)
1737@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
1738class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
1739 """S3Datastore specialization of a butler; an S3 storage Datastore +
1740 a local SQLite SqlRegistry.
1741 """
1743 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
1744 fullConfigKey = None
1745 validationCanFail = True
1747 bucketName = "anybucketname"
1748 """Name of the Bucket that will be used in the tests. The name is read from
1749 the config file used with the tests during set-up.
1750 """
1752 root = "butlerRoot/"
1753 """Root repository directory expected to be used in case useTempRoot=False.
1754 Otherwise the root is set to a 20 characters long randomly generated string
1755 during set-up.
1756 """
1758 datastoreStr = [f"datastore={root}"]
1759 """Contains all expected root locations in a format expected to be
1760 returned by Butler stringification.
1761 """
1763 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"]
1764 """The expected format of the S3 Datastore string."""
1766 registryStr = "/gen3.sqlite3"
1767 """Expected format of the Registry string."""
1769 mock_s3 = mock_s3()
1770 """The mocked s3 interface from moto."""
1772 def genRoot(self) -> str:
1773 """Returns a random string of len 20 to serve as a root
1774 name for the temporary bucket repo.
1776 This is equivalent to tempfile.mkdtemp as this is what self.root
1777 becomes when useTempRoot is True.
1778 """
1779 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20))
1780 return rndstr + "/"
1782 def setUp(self) -> None:
1783 config = Config(self.configFile)
1784 uri = ResourcePath(config[".datastore.datastore.root"])
1785 self.bucketName = uri.netloc
1787 # Enable S3 mocking of tests.
1788 self.mock_s3.start()
1790 # Set up some fake credentials if they do not exist.
1791 self.usingDummyCredentials = setAwsEnvCredentials()
1793 if self.useTempRoot:
1794 self.root = self.genRoot()
1795 rooturi = f"s3://{self.bucketName}/{self.root}"
1796 config.update({"datastore": {"datastore": {"root": rooturi}}})
1798 # Need a local folder to store the registry database.
1799 self.reg_dir = makeTestTempDir(TESTDIR)
1800 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3"
1802 # Moto needs to know that we expect the bucket bucketName to exist
1803 # (this used to be the class attribute bucketName).
1804 s3 = boto3.resource("s3")
1805 s3.create_bucket(Bucket=self.bucketName)
1807 self.datastoreStr = [f"datastore='{rooturi}'"]
1808 self.datastoreName = [f"FileDatastore@{rooturi}"]
1809 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
1810 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")
1812 def tearDown(self) -> None:
1813 s3 = boto3.resource("s3")
1814 bucket = s3.Bucket(self.bucketName)
1815 try:
1816 bucket.objects.all().delete()
1817 except botocore.exceptions.ClientError as e:
1818 if e.response["Error"]["Code"] == "404":
1819 # the key was not reachable - pass
1820 pass
1821 else:
1822 raise
1824 bucket = s3.Bucket(self.bucketName)
1825 bucket.delete()
1827 # Stop the S3 mock.
1828 self.mock_s3.stop()
1830 # Unset any potentially set dummy credentials.
1831 if self.usingDummyCredentials:
1832 unsetAwsEnvCredentials()
1834 if self.reg_dir is not None and os.path.exists(self.reg_dir):
1835 shutil.rmtree(self.reg_dir, ignore_errors=True)
1837 if self.useTempRoot and os.path.exists(self.root):
1838 shutil.rmtree(self.root, ignore_errors=True)
1840 super().tearDown()
1843class PosixDatastoreTransfers(unittest.TestCase):
1844 """Test data transfers between butlers.
1846 Different dataset ID managers are covered: UUID-to-UUID and
1847 integer-to-integer transfers are tested. UUID-to-integer is not
1848 supported, since we do not currently want to allow that. Integer-to-UUID
1849 is supported, with the caveat that UUID4 IDs will be generated, which is
1850 incorrect for raw dataset types; the test ignores that.
1851 """
1853 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
1854 storageClassFactory: StorageClassFactory
1856 @classmethod
1857 def setUpClass(cls) -> None:
1858 cls.storageClassFactory = StorageClassFactory()
1859 cls.storageClassFactory.addFromConfig(cls.configFile)
1861 def setUp(self) -> None:
1862 self.root = makeTestTempDir(TESTDIR)
1863 self.config = Config(self.configFile)
1865 def tearDown(self) -> None:
1866 removeTestTempDir(self.root)
1868 def create_butler(self, manager: str, label: str) -> Butler:
1869 config = Config(self.configFile)
1870 config["registry", "managers", "datasets"] = manager
1871 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
1873 def create_butlers(self, manager1: str | None = None, manager2: str | None = None) -> None:
1874 default = "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID"
1875 if manager1 is None:
1876 manager1 = default
1877 if manager2 is None:
1878 manager2 = default
1879 self.source_butler = self.create_butler(manager1, "1")
1880 self.target_butler = self.create_butler(manager2, "2")
1882 def testTransferUuidToUuid(self) -> None:
1883 self.create_butlers()
1884 self.assertButlerTransfers()
1886 def _enable_trust(self, datastore: Datastore) -> None:
1887 if hasattr(datastore, "trustGetRequest"):
1888 datastore.trustGetRequest = True
1889 elif hasattr(datastore, "datastores"):
1890 for datastore in datastore.datastores:
1891 if hasattr(datastore, "trustGetRequest"):
1892 datastore.trustGetRequest = True
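# Note: this helper descends only one level into a ChainedDatastore, which
# is sufficient for the datastore configurations used in these tests.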
1894 def testTransferMissing(self) -> None:
1895 """Test transfers where datastore records are missing.
1897 This is how execution butler works.
1898 """
1899 self.create_butlers()
1901 # Configure the source butler to allow trust.
1902 self._enable_trust(self.source_butler.datastore)
1904 self.assertButlerTransfers(purge=True)
1906 def testTransferMissingDisassembly(self) -> None:
1907 """Test transfers where datastore records are missing.
1909 This is how execution butler works.
1910 """
1911 self.create_butlers()
1913 # Configure the source butler to allow trust.
1914 self._enable_trust(self.source_butler.datastore)
1916 # Test disassembly.
1917 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite")
1919 def testAbsoluteURITransferDirect(self) -> None:
1920 """Test transfer using an absolute URI."""
1921 self._absolute_transfer("auto")
1923 def testAbsoluteURITransferCopy(self) -> None:
1924 """Test transfer using an absolute URI."""
1925 self._absolute_transfer("copy")
1927 def _absolute_transfer(self, transfer: str) -> None:
1928 self.create_butlers()
1930 storageClassName = "StructuredData"
1931 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
1932 datasetTypeName = "random_data"
1933 run = "run1"
1934 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
1936 dimensions = self.source_butler.registry.dimensions.extract(())
1937 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
1938 self.source_butler.registry.registerDatasetType(datasetType)
1940 metrics = makeExampleMetrics()
1941 with ResourcePath.temporary_uri(suffix=".json") as temp:
1942 dataId = DataCoordinate.makeEmpty(self.source_butler.dimensions)
1943 source_refs = [DatasetRef(datasetType, dataId, run=run)]
1944 temp.write(json.dumps(metrics.exportAsDict()).encode())
1945 dataset = FileDataset(path=temp, refs=source_refs)
1946 self.source_butler.ingest(dataset, transfer="direct")
1948 self.target_butler.transfer_from(
1949 self.source_butler, dataset.refs, register_dataset_types=True, transfer=transfer
1950 )
1952 uri = self.target_butler.getURI(dataset.refs[0])
1953 if transfer == "auto":
1954 self.assertEqual(uri, temp)
1955 else:
1956 self.assertNotEqual(uri, temp)
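# Sketch (hypothetical helper) of the behaviour asserted above: a
# transfer="direct" ingest records the dataset at its original absolute URI,
# so a later transfer_from with "auto" may leave the artifact in place while
# "copy" relocates it into the target datastore.
def _sketch_direct_ingest(butler: Butler, dataset: FileDataset) -> None:
    """Ingest a file in place, leaving it at its absolute URI."""
    butler.ingest(dataset, transfer="direct")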
1958 def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "StructuredData") -> None:
1959 """Test that a run can be transferred to another butler."""
1961 storageClass = self.storageClassFactory.getStorageClass(storageClassName)
1962 datasetTypeName = "random_data"
1964 # The test will create 3 collections, and we will want to transfer
1965 # two of those three.
1966 runs = ["run1", "run2", "other"]
1968 # Also want to use two different dataset types to ensure that
1969 # grouping works.
1970 datasetTypeNames = ["random_data", "random_data_2"]
1972 # Create the run collections in the source butler.
1973 for run in runs:
1974 self.source_butler.registry.registerCollection(run, CollectionType.RUN)
1976 # Create dimensions in source butler.
1977 n_exposures = 30
1978 self.source_butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
1979 self.source_butler.registry.insertDimensionData(
1980 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
1981 )
1982 self.source_butler.registry.insertDimensionData(
1983 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
1984 )
1986 for i in range(n_exposures):
1987 self.source_butler.registry.insertDimensionData(
1988 "exposure",
1989 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"},
1990 )
1992 # Create dataset types in the source butler.
1993 dimensions = self.source_butler.registry.dimensions.extract(["instrument", "exposure"])
1994 for datasetTypeName in datasetTypeNames:
1995 datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
1996 self.source_butler.registry.registerDatasetType(datasetType)
1998 # Write a dataset to an unrelated run -- this will ensure that
1999 # we are rewriting integer dataset IDs in the target if necessary.
2000 # This will not be relevant for UUIDs.
2001 run = "distraction"
2002 butler = Butler(butler=self.source_butler, run=run)
2003 butler.put(
2004 makeExampleMetrics(),
2005 datasetTypeName,
2006 exposure=1,
2007 instrument="DummyCamComp",
2008 physical_filter="d-r",
2009 )
2011 # Write some example metrics to the source
2012 butler = Butler(butler=self.source_butler)
2014 # Set of DatasetRefs that should be in the list of refs to transfer
2015 # but which will not be transferred.
2016 deleted: set[DatasetRef] = set()
2018 n_expected = 20 # Number of datasets expected to be transferred
2019 source_refs = []
2020 for i in range(n_exposures):
2021 # Put a third of the datasets into each collection; only retain
2022 # two thirds.
2023 index = i % 3
2024 run = runs[index]
2025 datasetTypeName = datasetTypeNames[i % 2]
2027 metric = MetricsExample(
2028 summary={"counter": i}, output={"text": "metric"}, data=[2 * x for x in range(i)]
2029 )
2030 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"}
2031 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run)
2033 # Remove the datastore record using low-level API
2034 if purge:
2035 # Remove records for a fraction.
2036 if index == 1:
2037 # For one of these delete the file as well.
2038 # This allows the "missing" code to filter the
2039 # file out.
2040 # Access the individual datastores.
2041 datastores = []
2042 if hasattr(butler.datastore, "datastores"):
2043 datastores.extend(butler.datastore.datastores)
2044 else:
2045 datastores.append(butler.datastore)
2047 if not deleted:
2048 # For a chained datastore we need to remove
2049 # files in each chain.
2050 for datastore in datastores:
2051 # The file might not be known to the datastore
2052 # if constraints are used.
2053 try:
2054 primary, uris = datastore.getURIs(ref)
2055 except FileNotFoundError:
2056 continue
2057 if primary:
2058 if primary.scheme != "mem":
2059 primary.remove()
2060 for uri in uris.values():
2061 if uri.scheme != "mem":
2062 uri.remove()
2063 n_expected -= 1
2064 deleted.add(ref)
2066 # Remove the datastore record.
2067 for datastore in datastores:
2068 if hasattr(datastore, "removeStoredItemInfo"):
2069 datastore.removeStoredItemInfo(ref)
2071 if index < 2:
2072 source_refs.append(ref)
2073 if ref not in deleted:
2074 new_metric = butler.get(ref)
2075 self.assertEqual(new_metric, metric)
2077 # Create some bad dataset types to ensure we check for inconsistent
2078 # definitions.
2079 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList")
2080 for datasetTypeName in datasetTypeNames:
2081 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass)
2082 self.target_butler.registry.registerDatasetType(datasetType)
2083 with self.assertRaises(ConflictingDefinitionError) as cm:
2084 self.target_butler.transfer_from(self.source_butler, source_refs)
2085 self.assertIn("dataset type differs", str(cm.exception))
2087 # And remove the bad definitions.
2088 for datasetTypeName in datasetTypeNames:
2089 self.target_butler.registry.removeDatasetType(datasetTypeName)
2091 # Transfer without creating dataset types should fail.
2092 with self.assertRaises(KeyError):
2093 self.target_butler.transfer_from(self.source_butler, source_refs)
2095 # Transfer without creating dimensions should fail.
2096 with self.assertRaises(ConflictingDefinitionError) as cm:
2097 self.target_butler.transfer_from(self.source_butler, source_refs, register_dataset_types=True)
2098 self.assertIn("dimension", str(cm.exception))
2100 # The failed transfer above leaves the registry in an inconsistent
2101 # state because the run is created but then rolled back without
2102 # the collection cache being cleared. For now, force a refresh.
2103 # This can be removed with DM-35498.
2104 self.target_butler.registry.refresh()
2106 # Now transfer them to the second butler, including dimensions.
2107 with self.assertLogs(level=logging.DEBUG) as log_cm:
2108 transferred = self.target_butler.transfer_from(
2109 self.source_butler,
2110 source_refs,
2111 register_dataset_types=True,
2112 transfer_dimensions=True,
2113 )
2114 self.assertEqual(len(transferred), n_expected)
2115 log_output = ";".join(log_cm.output)
2117 # A ChainedDatastore will use the in-memory datastore for mexists,
2118 # so we cannot rely on the mexists log message.
2119 self.assertIn("Number of datastore records found in source", log_output)
2120 self.assertIn("Creating output run", log_output)
2122 # Do the transfer twice to ensure that it will do nothing extra.
2123 # Only do this if purge=True because it does not work for int
2124 # dataset_id.
2125 if purge:
2126 # This should not need to register dataset types.
2127 transferred = self.target_butler.transfer_from(self.source_butler, source_refs)
2128 self.assertEqual(len(transferred), n_expected)
2130 # Also do an explicit low-level transfer to trigger some
2131 # edge cases.
2132 with self.assertLogs(level=logging.DEBUG) as log_cm:
2133 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs)
2134 log_output = ";".join(log_cm.output)
2135 self.assertIn("no file artifacts exist", log_output)
2137 with self.assertRaises((TypeError, AttributeError)):
2138 self.target_butler.datastore.transfer_from(self.source_butler, source_refs) # type: ignore
2140 with self.assertRaises(ValueError):
2141 self.target_butler.datastore.transfer_from(
2142 self.source_butler.datastore, source_refs, transfer="split"
2143 )
2145 # Now try to get the same refs from the new butler.
2146 for ref in source_refs:
2147 if ref not in deleted:
2148 new_metric = self.target_butler.get(ref)
2149 old_metric = self.source_butler.get(ref)
2150 self.assertEqual(new_metric, old_metric)
2152 # Now prune run2 collection and create instead a CHAINED collection.
2153 # This should block the transfer.
2154 self.target_butler.removeRuns(["run2"], unstore=True)
2155 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED)
2156 with self.assertRaises(CollectionTypeError):
2157 # Re-importing the run1 datasets can be problematic if they
2158 # use integer IDs, so filter those out.
2159 to_transfer = [ref for ref in source_refs if ref.run == "run2"]
2160 self.target_butler.transfer_from(self.source_butler, to_transfer)
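# Condensed sketch (hypothetical helper) of the high-level transfer API this
# method exercises end to end.
def _sketch_transfer(source: Butler, target: Butler, refs: list[DatasetRef]) -> None:
    """Transfer datasets between butlers, creating missing definitions."""
    target.transfer_from(
        source, refs, register_dataset_types=True, transfer_dimensions=True
    )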
2163class ChainedDatastoreTransfers(PosixDatastoreTransfers):
2164 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
2167if __name__ == "__main__":
2168 unittest.main()