# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
22"""Tests for Butler.
23"""
25import os
26import posixpath
27import unittest
28import tempfile
29import shutil
30import pickle
31import string
32import random
33import numpy as np

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto mock_s3 can not be imported.
        """
        return cls
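
# Editorial sketch (not in the original tests): the no-op fallback above
# keeps ``@mock_s3`` usable as a class decorator even when moto is not
# installed; the decorated class is returned unchanged, and the actual
# skipping is handled separately, e.g.
#
#     @unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
#     @mock_s3
#     class SomeS3BackedTestCase(unittest.TestCase):  # hypothetical name
#         ...
#
# which is exactly the pattern used by S3DatastoreButlerTestCase below.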

from lsst.utils import doImport
from lsst.daf.butler.core.safeFileIo import safeMakeDir
from lsst.daf.butler import Butler, Config, ButlerConfig
from lsst.daf.butler import StorageClassFactory
from lsst.daf.butler import DatasetType, DatasetRef
from lsst.daf.butler import FileTemplateValidationError, ValidationError
from lsst.daf.butler import FileDataset
from lsst.daf.butler import CollectionSearch, CollectionType
from lsst.daf.butler.registry import MissingCollectionError
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.core.location import ButlerURI
from lsst.daf.butler.core.s3utils import (s3CheckFileExists, setAwsEnvCredentials,
                                          unsetAwsEnvCredentials)

from lsst.daf.butler.tests import MultiDetectorFormatter, MetricsExample

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
    return MetricsExample({"AM1": 5.2, "AM2": 30.6},
                          {"a": [1, 2, 3],
                           "b": {"blue": 5, "red": "green"}},
                          [563, 234, 456.7, 752, 8, 9, 27]
                          )
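
# Editorial note (an assumption about the helper API, not stated in this
# file): the three positional arguments to MetricsExample appear to become
# the ``summary``, ``output`` and ``data`` attributes compared throughout
# the tests below, e.g.
#
#     metric = makeExampleMetrics()
#     metric.summary["AM1"]   # -> 5.2, assuming the first argument is summary
#     metric.data[:4]         # the piece a parameters={"slice": ...} get returns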


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent the misdiagnosis
    that might otherwise occur when a standard exception is used.
    """
    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not tested in other test cases.
    """

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests:
    """Helper methods for running a suite of put/get tests with different
    butler configurations."""

    root = None

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it.
        """
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType
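
    # Usage sketch (editorial; mirrors the calls made in the tests below):
    #
    #     datasetType = self.addDatasetType(
    #         "test_metric", dimensions, storageClass, butler.registry)
    #
    # The returned DatasetType can then be used to build DatasetRefs for put().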

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference):
        datasetTypeName = datasetRef.datasetType.name
        dataId = datasetRef.dataId
        for component in components:
            compTypeName = DatasetType.nameWithComponent(datasetTypeName, component)
            result = butler.get(compTypeName, dataId)
            self.assertEqual(result, getattr(reference, component))
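
    # Editorial note: DatasetType.nameWithComponent composes the component
    # dataset type name from the parent name, e.g. ("test_metric", "summary")
    # -> "test_metric.summary"; the same dotted "datasetType.component" form
    # shows up in the validateConfiguration ignore lists below.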

    def tearDown(self):
        if self.root is not None and os.path.exists(self.root):
            shutil.rmtree(self.root, ignore_errors=True)

    def runPutGetTest(self, storageClass, datasetTypeName):
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = "ingest/run"
        tag = "ingest"
        butler = Butler(self.tmpConfigFile, run=run, collections=[tag], tags=[tag])

        # The run and tagged collections should have been registered by the
        # Butler constructor.
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run, tag]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Try to create one that will have a name that is too long
        with self.assertRaises(Exception) as cm:
            self.addDatasetType("DatasetTypeNameTooLong" * 50, dimensions, storageClass, butler.registry)
        self.assertIn("check constraint", str(cm.exception).lower())

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "abstract_filter": "R"})
        butler.registry.insertDimensionData("visit_system", {"instrument": "DummyCamComp",
                                                             "id": 1,
                                                             "name": "default"})
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 423,
                                                      "name": "fourtwentythree", "physical_filter": "d-r",
                                                      "visit_system": 1})

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            with self.subTest(args=args):
                ref = butler.put(metric, *args)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(butler, ref,
                                             ("summary", "data", "output"), metric)

                # Remove from the tagged collection only; after that we
                # shouldn't be able to find it unless we use the dataset_id.
                butler.pruneDatasets([ref])
                with self.assertRaises(LookupError):
                    butler.datasetExists(*args)
                # Registry still knows about it, if we use the dataset_id.
                self.assertEqual(butler.registry.getDataset(ref.id), ref)
                # If we use the output ref with the dataset_id, we should
                # still be able to load it with getDirect().
                self.assertEqual(metric, butler.getDirect(ref))

                # Reinsert into collection, then delete from Datastore *and*
                # remove from collection.
                butler.registry.associate(tag, [ref])
                butler.pruneDatasets([ref], unstore=True)
                # Lookup with original args should still fail.
                with self.assertRaises(LookupError):
                    butler.datasetExists(*args)
                # Now getDirect() should fail, too.
                with self.assertRaises(FileNotFoundError, msg=f"Checking ref {ref} not found"):
                    butler.getDirect(ref)
                # Registry still knows about it, if we use the dataset_id.
                self.assertEqual(butler.registry.getDataset(ref.id), ref)

                # Now remove the dataset completely.
                butler.pruneDatasets([ref], purge=True, unstore=True)
                # Lookup with original args should still fail.
                with self.assertRaises(LookupError):
                    butler.datasetExists(*args)
                # getDirect() should still fail.
                with self.assertRaises(FileNotFoundError):
                    butler.getDirect(ref)
                # Registry shouldn't be able to find it by dataset_id anymore.
                self.assertIsNone(butler.registry.getDataset(ref.id))

        # Put the dataset again, since the last thing we did was remove it.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            # ref.components will only be populated in certain cases
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = DatasetType.nameWithComponent(datasetTypeName, "summary")
            compNameD = DatasetType.nameWithComponent(datasetTypeName, "data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            if ref.components:
                self.assertTrue(butler.datastore.exists(ref.components["summary"]))
                self.assertEqual(compRef, ref.components["summary"])
                self.assertTrue(butler.datastore.exists(ref.components["data"]))
            else:
                self.assertTrue(compRef.hasParentId)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(datasetTypeName, dimensions,
                                              self.storageClassFactory.getStorageClass("Config"))

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {run, tag})

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Add a dataset back in since some downstream tests require
        # something to be present
        ref = butler.put(metric, refIn)

        return butler
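
    # Editorial note: runPutGetTest returns the populated Butler so that
    # callers such as runImportExportTest (below) can reuse the repository
    # contents it leaves behind.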

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType("example", dimensions,
                                          self.storageClassFactory.getStorageClass("StructuredData"),
                                          butler.registry)
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "abstract_filter": "R"})
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 423,
                                                      "name": "fourtwentythree", "physical_filter": "d-r"})
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        butler.registry.registerRun(run)
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with TypeError.
        with self.assertRaises(TypeError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is a TypeError.
        with self.assertRaises(TypeError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(TypeError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Removing the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler.
    """
    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        if self.useTempRoot:
            self.root = tempfile.mkdtemp(dir=TESTDIR)
            Butler.makeRepo(self.root, config=Config(self.configFile))
            self.tmpConfigFile = os.path.join(self.root, "butler.yaml")
        else:
            self.root = None
            self.tmpConfigFile = self.configFile

    def testConstructor(self):
        """Independent test of constructor.
        """
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {"ingest"})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, CollectionSearch.fromExpression(["other"]))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.registry, butler2.registry)
        self.assertIs(butler.datastore, butler2.datastore)

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runPutGetTest(storageClass, "test_metric_comp")

    def testIngest(self):
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "abstract_filter": "R"})
        for detector in (1, 2):
            butler.registry.insertDimensionData("detector", {"instrument": "DummyCamComp", "id": detector,
                                                             "full_name": f"detector{detector}"})

        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 423,
                                                      "name": "fourtwentythree", "physical_filter": "d-r"},
                                            {"instrument": "DummyCamComp", "id": 424,
                                             "name": "fourtwentyfour", "physical_filter": "d-r"})

        formatter = doImport("lsst.daf.butler.formatters.yamlFormatter.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile,
                                        refs=[refIn],
                                        formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getUri(datasetTypeName, dataId1)
        uri2 = butler.getUri(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile,
                                    refs=refs,
                                    formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getUri(datasetTypeName, dataId1)
        uri2 = butler.getUri(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2)

        # Test that removing one does not break the second
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)
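
    # Editorial note: in the single-file ingest above, one FileDataset
    # carries two DatasetRefs that point at the same file, which is why the
    # two URIs compare equal while the datasets remain distinct.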

    def testPruneCollections(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets.  ref1 and ref2 have the same data ID, and are in
        # different runs.  ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType("prune_collections_test_dataset", dimensions, storageClass,
                                          butler.registry)
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)
        # Try to delete a RUN collection without purge, or with purge and not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2.  It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        self.assertTrue(butler.datastore.exists(ref1))
        self.assertTrue(butler.datastore.exists(ref2))
        self.assertTrue(butler.datastore.exists(ref3))
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False.  This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        self.assertTrue(butler.datastore.exists(ref1))
        self.assertTrue(butler.datastore.exists(ref2))
        self.assertTrue(butler.datastore.exists(ref3))
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        self.assertTrue(butler.datastore.exists(ref1))
        self.assertTrue(butler.datastore.exists(ref2))
        self.assertFalse(butler.datastore.exists(ref3))
        # Delete the chain with unstore=False.  The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        self.assertTrue(butler.datastore.exists(ref1))
        self.assertTrue(butler.datastore.exists(ref2))
        self.assertFalse(butler.datastore.exists(ref3))
        # Redefine and then delete the chain with unstore=True.  Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertTrue(butler.datastore.exists(ref2))
        self.assertFalse(butler.datastore.exists(ref3))
        # Remove run1.  This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2.  This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [])

    def testPickle(self):
        """Test pickle support.
        """
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            ("instrument", {"instrument": "DummyCam"}, {"instrument": "DummyHSC"},
             {"instrument": "DummyCamComp"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "abstract_filter": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"})
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are created
        # for each component.  Entries are needed for each component in the
        # test configuration, otherwise validation won't work.  The ones that
        # are deliberately broken will be ignored later.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes())
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(ignore=["test_metric_comp", "metric3", "calexp", "DummySC",
                                             "datasetType.component"])

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(ignore=["test_metric_comp", "metric3", "calexp", "DummySC",
                                             "datasetType.component"])

    def testTransaction(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (("instrument", {"instrument": "DummyCam"}),
                            ("physical_filter", {"instrument": "DummyCam", "name": "d-r",
                                                 "abstract_filter": "R"}),
                            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo",
                                       "physical_filter": "d-r"}))
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref,
                                         ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(LookupError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # Remove the file created in setUp
        os.unlink(self.tmpConfigFile)

        butlerConfig = Butler.makeRepo(self.root, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(self.root, standalone=True, createRegistry=False,
                                       config=Config(self.configFile), overwrite=True)
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # "full" should have inherited this key from the defaults; "limited"
        # (read straight from the original config file) should lack it.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with a relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, createRegistry=False,
                            config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)


class FileLikeDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileLikeDatastore.
    """

    def checkFileExists(self, root, path):
        """Checks if a file exists at a given path (relative to root).

        Test testPutTemplates verifies actual physical existence of the files
        in the requested location.  For PosixDatastore this check is
        equivalent to an `os.path.exists` call.
        """
        return os.path.exists(os.path.join(root, path))

    def testPutTemplates(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "abstract_filter": "R"})
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 423,
                                                      "name": "v423", "physical_filter": "d-r"})
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 425,
                                                      "name": "v425", "physical_filter": "d-r"})

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create three almost-identical DatasetTypes (all will use the
        # default template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": np.int64(423)}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}
        dataId3 = {"instrument": "DummyCamComp", "visit": 425}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        self.assertTrue(self.checkFileExists(butler.datastore.root,
                                             "ingest/metric1/d-r/DummyCamComp_423.pickle"))

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        self.assertTrue(self.checkFileExists(butler.datastore.root,
                                             "ingest/metric2/d-r/DummyCamComp_v423.pickle"))

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Now use a file template that will not result in unique filenames
        ref = butler.put(metric, "metric3", dataId1)

        # Check the template based on dimensions.  This one is a bad template
        with self.assertRaises(FileTemplateValidationError):
            butler.datastore.templates.validateTemplates([ref])

        with self.assertRaises(FileExistsError):
            butler.put(metric, "metric3", dataId3)

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Export those datasets.  We use TemporaryDirectory because there
        # doesn't seem to be a way to get the filename (as opposed to the file
        # object) from any of tempfile's temporary-file context managers.
        with tempfile.TemporaryDirectory() as exportDir:
            # TODO: When PosixDatastore supports transfer-on-exist, add tests
            # for that.
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile) as export:
                export.saveDatasets(datasets)
            self.assertTrue(os.path.exists(exportFile))
            with tempfile.TemporaryDirectory() as importDir:
                Butler.makeRepo(importDir, config=Config(self.configFile))
                importButler = Butler(importDir, run="ingest/run")
                importButler.import_(filename=exportFile, directory=exportButler.datastore.root,
                                     transfer="symlink")
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))


class PosixDatastoreButlerTestCase(FileLikeDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler"""
    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"PosixDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"


class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
    """InMemoryDatastore specialization of a butler"""
    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")
    fullConfigKey = None
    useTempRoot = False
    validationCanFail = False
    datastoreStr = ["datastore='InMemory"]
    datastoreName = ["InMemoryDatastore@"]
    registryStr = ":memory:"

    def testIngest(self):
        pass


class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
    """ChainedDatastore specialization of a butler"""
    configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")
    fullConfigKey = ".datastore.datastores.1.formatters"
    validationCanFail = True
    datastoreStr = ["datastore='InMemory", "/PosixDatastore_1,", "/PosixDatastore_2'"]
    datastoreName = ["InMemoryDatastore@", f"PosixDatastore@{BUTLER_ROOT_TAG}/PosixDatastore_1",
                     "SecondDatastore"]
    registryStr = "/gen3.sqlite3"


class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase):
    """Test that a yaml file in one location can refer to a root in another."""

    datastoreStr = ["dir1"]
    # Disable the makeRepo test since we are deliberately not using
    # butler.yaml as the config name.
    fullConfigKey = None

    def setUp(self):
        self.root = tempfile.mkdtemp(dir=TESTDIR)

        # Make a new repository in one place
        self.dir1 = os.path.join(self.root, "dir1")
        Butler.makeRepo(self.dir1, config=Config(self.configFile))

        # Move the yaml file to a different place and add a "root"
        self.dir2 = os.path.join(self.root, "dir2")
        safeMakeDir(self.dir2)
        configFile1 = os.path.join(self.dir1, "butler.yaml")
        config = Config(configFile1)
        config["root"] = self.dir1
        configFile2 = os.path.join(self.dir2, "butler2.yaml")
        config.dumpToFile(configFile2)
        os.remove(configFile1)
        self.tmpConfigFile = configFile2

    def testFileLocations(self):
        self.assertNotEqual(self.dir1, self.dir2)
        self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml")))
        self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml")))
        self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3")))


class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase):
    """Test that a config file created by makeRepo outside of the repo
    works."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        self.root = tempfile.mkdtemp(dir=TESTDIR)
        self.root2 = tempfile.mkdtemp(dir=TESTDIR)

        self.tmpConfigFile = os.path.join(self.root2, "different.yaml")
        Butler.makeRepo(self.root, config=Config(self.configFile),
                        outfile=self.tmpConfigFile)

    def tearDown(self):
        if os.path.exists(self.root2):
            shutil.rmtree(self.root2, ignore_errors=True)
        super().tearDown()

    def testConfigExistence(self):
        c = Config(self.tmpConfigFile)
        uri_config = ButlerURI(c["root"])
        uri_expected = ButlerURI(self.root, forceDirectory=True)
        self.assertEqual(uri_config.geturl(), uri_expected.geturl())
        self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path")

    def testPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")


class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase):
    """Test that a config file created by makeRepo outside of the repo,
    given as a directory, works."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        self.root = tempfile.mkdtemp(dir=TESTDIR)
        self.root2 = tempfile.mkdtemp(dir=TESTDIR)

        self.tmpConfigFile = self.root2
        Butler.makeRepo(self.root, config=Config(self.configFile),
                        outfile=self.tmpConfigFile)

    def testConfigExistence(self):
        # Append the yaml file else Config constructor does not know the file
        # type.
        self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml")
        super().testConfigExistence()


class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase):
    """Test that a config file created by makeRepo outside of the repo,
    given as a URI, works."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        self.root = tempfile.mkdtemp(dir=TESTDIR)
        self.root2 = tempfile.mkdtemp(dir=TESTDIR)

        self.tmpConfigFile = ButlerURI(os.path.join(self.root2, "something.yaml")).geturl()
        Butler.makeRepo(self.root, config=Config(self.configFile),
                        outfile=self.tmpConfigFile)


@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
@mock_s3
class S3DatastoreButlerTestCase(FileLikeDatastoreButlerTests, unittest.TestCase):
    """S3Datastore specialization of a butler; an S3 storage Datastore +
    a local in-memory SqlRegistry.
    """
    configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml")
    fullConfigKey = None
    validationCanFail = True

    bucketName = "anybucketname"
    """Name of the bucket that will be used in the tests.  The name is read
    from the config file used with the tests during set-up.
    """

    root = "butlerRoot/"
    """Root repository directory expected to be used when useTempRoot=False.
    Otherwise the root is set to a randomly generated 20-character string
    during set-up.
    """

    datastoreStr = [f"datastore={root}"]
    """Contains all expected root locations in a format expected to be
    returned by Butler stringification.
    """

    datastoreName = [f"S3Datastore@s3://{bucketName}/{root}"]
    """The expected format of the S3Datastore string."""

    registryStr = ":memory:"
    """Expected format of the Registry string."""

    def genRoot(self):
        """Returns a random 20-character string to serve as a root
        name for the temporary bucket repo.

        This is equivalent to tempfile.mkdtemp as this is what self.root
        becomes when useTempRoot is True.
        """
        rndstr = "".join(
            random.choice(string.ascii_uppercase + string.digits) for _ in range(20)
        )
        return rndstr + "/"
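
    # Editorial note: the trailing "/" makes the generated root behave as a
    # key prefix (a "directory") inside the bucket; setUp() below joins it
    # into the repo URI as f"s3://{self.bucketName}/{self.root}".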

    def setUp(self):
        config = Config(self.configFile)
        uri = ButlerURI(config[".datastore.datastore.root"])
        self.bucketName = uri.netloc

        # set up some fake credentials if they do not exist
        self.usingDummyCredentials = setAwsEnvCredentials()

        if self.useTempRoot:
            self.root = self.genRoot()
        rooturi = f"s3://{self.bucketName}/{self.root}"
        config.update({"datastore": {"datastore": {"root": rooturi}}})

        # MOTO needs to know that we expect Bucket bucketname to exist
        # (this used to be the class attribute bucketName)
        s3 = boto3.resource("s3")
        s3.create_bucket(Bucket=self.bucketName)

        self.datastoreStr = f"datastore={self.root}"
        self.datastoreName = [f"S3Datastore@{rooturi}"]
        Butler.makeRepo(rooturi, config=config, forceConfigRoot=False)
        self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml")

    def tearDown(self):
        s3 = boto3.resource("s3")
        bucket = s3.Bucket(self.bucketName)
        try:
            bucket.objects.all().delete()
        except botocore.exceptions.ClientError as e:
            if e.response["Error"]["Code"] == "404":
                # the key was not reachable - pass
                pass
            else:
                raise

        bucket = s3.Bucket(self.bucketName)
        bucket.delete()

        # unset any potentially set dummy credentials
        if self.usingDummyCredentials:
            unsetAwsEnvCredentials()

    def checkFileExists(self, root, relpath):
        """Checks if a file exists at a given path (relative to root).

        Test testPutTemplates verifies actual physical existence of the files
        in the requested location.  For S3Datastore this check is equivalent
        to a `lsst.daf.butler.core.s3utils.s3CheckFileExists` call.
        """
        uri = ButlerURI(root)
        uri.updateFile(relpath)
        return s3CheckFileExists(uri)[0]

    @unittest.expectedFailure
    def testImportExport(self):
        super().testImportExport()


if __name__ == "__main__":
    unittest.main()