Coverage for tests/test_butler.py: 16%


1090 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler.
"""

import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import socket
import string
import tempfile
import time
import unittest
import unittest.mock  # needed explicitly for unittest.mock.patch below

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto mock_s3 can not be imported."""
        return cls


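# A sketch of how the fallback above is used (the class name here is
# hypothetical): when moto is available, ``@mock_s3`` patches S3 access for
# the whole test case; when it is not, the no-op stand-in simply returns the
# class unchanged, so the module still imports cleanly.
#
#     @mock_s3
#     class S3DatastoreButlerTestCase(unittest.TestCase):
#         ...
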

try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None

from tempfile import gettempdir
from threading import Thread

import astropy.time
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    ButlerURI,
    CollectionSearch,
    CollectionType,
    Config,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError, MissingCollectionError
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources.http import isWebdavEndpoint
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImport
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )


class TransactionTestError(Exception):
    """Specific error for testing transactions, used to avoid the
    misdiagnosis that might otherwise occur when a standard exception is
    raised.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not covered by any other
    test case."""

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")
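
        # Behaviour exercised above, in brief: directories given via
        # ``searchPaths`` are consulted before the defaults, so a config
        # file found there can override individual keys. A sketch with a
        # hypothetical override directory:
        #
        #     config = ButlerConfig(configFile, searchPaths=["/my/overrides"])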


class ButlerPutGetTests:
    """Helper class for running a suite of put/get tests against different
    butler configurations."""

    root = None

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it"""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

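    # Naming convention relied on above: a component dataset type is named
    # "<parent>.<component>" (see DatasetType.nameWithComponent later in this
    # file), so for a hypothetical parent "test_metric" the "summary"
    # component is fetched as "test_metric.summary".
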

    def tearDown(self):
        removeTestTempDir(self.root)

    def runPutGetTest(self, storageClass, datasetTypeName):
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = "ingest"
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add a second visit for some later tests
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 424,
                "name": "fourtwentyfour",
                "physical_filter": "d-r",
                "visit_system": 1,
            },
        )

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

            # Can the artifacts themselves be retrieved?
            if not butler.datastore.isEphemeral:
                root_uri = ButlerURI(self.root)

                for preserve_path in (True, False):
                    destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                    # Use copy so that we can test that overwrite
                    # protection works (using "auto" for File URIs would
                    # use hard links and subsequent transfer would work
                    # because it knows they are the same file).
                    transferred = butler.retrieveArtifacts(
                        [ref], destination, preserve_path=preserve_path, transfer="copy"
                    )
                    self.assertGreater(len(transferred), 0)
                    artifacts = list(ButlerURI.findFileResources([destination]))
                    self.assertEqual(set(transferred), set(artifacts))

                    for artifact in transferred:
                        path_in_destination = artifact.relative_to(destination)
                        self.assertIsNotNone(path_in_destination)

                        # When path is not preserved there should not be
                        # any path separators.
                        num_seps = path_in_destination.count("/")
                        if preserve_path:
                            self.assertGreater(num_seps, 0)
                        else:
                            self.assertEqual(num_seps, 0)

                    primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                    n_uris = len(secondary_uris)
                    if primary_uri:
                        n_uris += 1
                    self.assertEqual(
                        len(artifacts),
                        n_uris,
                        "Comparing expected artifacts vs actual:"
                        f" {artifacts} vs {primary_uri} and {secondary_uris}",
                    )

                    if preserve_path:
                        # No need to run these twice
                        with self.assertRaises(ValueError):
                            butler.retrieveArtifacts([ref], destination, transfer="move")

                        with self.assertRaises(FileExistsError):
                            butler.retrieveArtifacts([ref], destination)

                        transferred_again = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, overwrite=True
                        )
                        self.assertEqual(set(transferred_again), set(transferred))

            # Now remove the dataset completely.
            butler.pruneDatasets([ref], purge=True, unstore=True, run=this_run)
            # Lookup with original args should still fail.
            with self.assertRaises(LookupError):
                butler.datasetExists(*args, collections=this_run)
            # getDirect() should still fail.
            with self.assertRaises(FileNotFoundError):
                butler.getDirect(ref)
            # Registry shouldn't be able to find it by dataset_id anymore.
            self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know they are
            # empty.
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed.
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present.

        return butler

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Registering a second time is allowed but indicates a no-op.
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with TypeError.
        with self.assertRaises(TypeError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection raises TypeError.
        with self.assertRaises(TypeError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(TypeError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Deleting the dataset from the new collection should make it findable
        # in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
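
        # Pattern exercised above, as a sketch: getDeferred() returns a
        # handle whose .get() performs the actual read, so the I/O can be
        # postponed until the value is needed:
        #
        #     handle = butler.getDeferred(datasetType, dataId, collections=[run])
        #     ...  # other work
        #     metric = handle.get()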


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {"ingest"})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, CollectionSearch.fromExpression(["other"]))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ButlerURI.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ButlerURI(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        self.assertEqual(Butler.get_known_repos(), set())
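
        # The index consulted via DAF_BUTLER_REPOSITORY_INDEX, written out
        # above with Config.dumpToUri, is a plain label-to-URI mapping; a
        # sketch of its YAML form:
        #
        #     label: /path/to/repo/butler.yaml
        #     bad_label: s3://bucket/not_real.yaml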


    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ButlerURI)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ButlerURI)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ButlerURI)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ButlerURI)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ButlerURI)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ButlerURI)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testIngest(self):
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile, refs=refs, formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)
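
        # Ingest pattern used twice above, as a sketch: wrap each existing
        # file in a FileDataset (with one ref per dataset the file holds,
        # so a single file may back several refs) and hand it to ingest()
        # with an explicit transfer mode:
        #
        #     butler.ingest(FileDataset(path=..., refs=[...], formatter=...), transfer="copy")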

    def testPruneCollections(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge and not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [])

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)
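
        # Semantics exercised above, in brief: pruneCollection() demands
        # purge=True together with unstore=True for RUN collections,
        # rejects purge for TAGGED and CHAINED collections, refuses to
        # delete runs still referenced by a chain, and only touches
        # datastore artifacts when unstore=True.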

    def testPickle(self):
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            (
                "instrument",
                {"instrument": "DummyCam"},
                {"instrument": "DummyHSC"},
                {"instrument": "DummyCamComp"},
            ),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are not
        # created for its components, but querying for them can still return
        # the components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(LookupError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)
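
        # Pattern under test, as a sketch: everything inside the
        # butler.transaction() context is rolled back when the block raises,
        # including registry inserts and the stored file.
        #
        #     with butler.transaction():
        #         butler.put(metric, datasetTypeName, dataId)
        #         raise RuntimeError("undo the put")  # hypothetical trigger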

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""
        butler = Butler(self.tmpConfigFile, run="ingest")

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check if a file exists at a given path (relative to root).

        The testPutTemplates test verifies the actual physical existence of
        the files in the requested location.
        """
        uri = ButlerURI(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric1/??#?/d-r/DummyCamComp_423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric2/d-r/DummyCamComp_v423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)
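
        # As the assertions above show, the file template expands data ID
        # values into the artifact path (run, dataset type, filter, visit),
        # and template validation rejects a template that cannot produce
        # unique filenames for a dataset type's dimensions.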

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """Test exporting to a temp directory and importing back into a new
        temp-directory repo. Does not assume a posix datastore."""
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        print("Root:", exportButler.datastore.root)
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements
                # even though there aren't any in these datasets or in the
                # database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public API.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                        reuse_ids=False,
                    )
                importButler = Butler(importDir, run="ingest")
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )

    def testRemoveRuns(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put a dataset in each.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])
        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())
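
        # As asserted above: removeRuns(..., unstore=True) deletes the
        # underlying artifacts, while unstore=False removes the run from
        # the registry but leaves the files on disk.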

1227 

1228 

1229class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1230 """PosixDatastore specialization of a butler""" 

1231 

1232 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1233 fullConfigKey = ".datastore.formatters" 

1234 validationCanFail = True 

1235 datastoreStr = ["/tmp"] 

1236 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"] 

1237 registryStr = "/gen3.sqlite3" 

1238 

1239 def testPathConstructor(self): 

1240 """Independent test of constructor using PathLike.""" 

1241 butler = Butler(self.tmpConfigFile, run="ingest") 

1242 self.assertIsInstance(butler, Butler) 

1243 

1244 # And again with a Path object with the butler yaml 

1245 path = pathlib.Path(self.tmpConfigFile) 

1246 butler = Butler(path, writeable=False) 

1247 self.assertIsInstance(butler, Butler) 

1248 

1249 # And again with a Path object without the butler yaml 

1250 # (making sure we skip it if the tmp config doesn't end 

1251 # in butler.yaml -- which is the case for a subclass) 

1252 if self.tmpConfigFile.endswith("butler.yaml"): 

1253 path = pathlib.Path(os.path.dirname(self.tmpConfigFile)) 

1254 butler = Butler(path, writeable=False) 

1255 self.assertIsInstance(butler, Butler) 

1256 

1257 def testExportTransferCopy(self): 

1258 """Test local export using all transfer modes""" 

1259 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1260 exportButler = self.runPutGetTest(storageClass, "test_metric") 

1261 # Test that the repo actually has at least one dataset. 

1262 datasets = list(exportButler.registry.queryDatasets(..., collections=...)) 

1263 self.assertGreater(len(datasets), 0) 

1264 uris = [exportButler.getURI(d) for d in datasets] 

1265 datastoreRoot = exportButler.datastore.root 

1266 

1267 pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris] 

1268 

1269 for path in pathsInStore: 

1270 # Assume local file system 

1271 self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}") 

1272 

1273 for transfer in ("copy", "link", "symlink", "relsymlink"): 

1274 with safeTestTempDir(TESTDIR) as exportDir: 

1275 with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export: 

1276 export.saveDatasets(datasets) 

1277 for path in pathsInStore: 

1278 self.assertTrue( 

1279 self.checkFileExists(exportDir, path), 

1280 f"Check that mode {transfer} exported files", 

1281 ) 

1282 

1283 def testPruneDatasets(self): 

1284 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1285 butler = Butler(self.tmpConfigFile, writeable=True) 

1286 # Load registry data with dimensions to hang datasets off of. 

1287 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry")) 

1288 butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) 

1289 # Add some RUN-type collections. 

1290 run1 = "run1" 

1291 butler.registry.registerRun(run1) 

1292 run2 = "run2" 

1293 butler.registry.registerRun(run2) 

1294 # put some datasets. ref1 and ref2 have the same data ID, and are in 

1295 # different runs. ref3 has a different data ID. 

1296 metric = makeExampleMetrics() 

1297 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"]) 

1298 datasetType = self.addDatasetType( 

1299 "prune_collections_test_dataset", dimensions, storageClass, butler.registry 

1300 ) 

1301 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1) 

1302 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2) 

1303 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1) 

1304 

1305 # Simple prune. 

1306 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1307 with self.assertRaises(LookupError): 

1308 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1) 

1309 

1310 # Put data back. 

1311 ref1 = butler.put(metric, ref1.unresolved(), run=run1) 

1312 ref2 = butler.put(metric, ref2.unresolved(), run=run2) 

1313 ref3 = butler.put(metric, ref3.unresolved(), run=run1) 

1314 

1315 # Check that in normal mode, deleting the record first means that

1316 # emptying the trash will not touch the file.

1317 uri1 = butler.datastore.getURI(ref1) 

1318 butler.datastore.bridge.moveToTrash([ref1]) # Update the dataset_location table 

1319 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id}) 

1320 butler.datastore.trash(ref1) 

1321 butler.datastore.emptyTrash() 

1322 self.assertTrue(uri1.exists()) 

1323 uri1.remove() # Clean it up. 

1324 

1325 # Simulate execution butler setup by deleting the datastore 

1326 # record but keeping the file around and trusting. 

1327 butler.datastore.trustGetRequest = True 
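# With trustGetRequest enabled the datastore consults the file system even

# when no datastore record exists, which is what the checks below rely on.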

1328 uri2 = butler.datastore.getURI(ref2) 

1329 uri3 = butler.datastore.getURI(ref3) 

1330 self.assertTrue(uri2.exists()) 

1331 self.assertTrue(uri3.exists()) 

1332 

1333 # Remove the datastore record. 

1334 butler.datastore.bridge.moveToTrash([ref2]) # Update the dataset_location table 

1335 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id}) 

1336 self.assertTrue(uri2.exists()) 

1337 butler.datastore.trash([ref2, ref3]) 

1338 # Immediate removal for ref2 file 

1339 self.assertFalse(uri2.exists()) 

1340 # But ref3 has to wait for the empty. 

1341 self.assertTrue(uri3.exists()) 

1342 butler.datastore.emptyTrash() 

1343 self.assertFalse(uri3.exists()) 

1344 

1345 # Clear out the datasets from registry. 

1346 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1347 

1348 def testPytypeCoercion(self): 

1349 """Test python type coercion on Butler.get""" 

1350 

1351 # Store some data with the normal example storage class. 

1352 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1353 datasetTypeName = "test_metric" 

1354 butler = self.runPutGetTest(storageClass, datasetTypeName) 

1355 

1356 dataId = {"instrument": "DummyCamComp", "visit": 423} 

1357 metric = butler.get(datasetTypeName, dataId=dataId) 

1358 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample") 

1359 

1360 datasetType_ori = butler.registry.getDatasetType(datasetTypeName) 

1361 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents") 

1362 

1363 # Now need to hack the registry dataset type definition. 

1364 # There is no API for this. 

1365 manager = butler.registry._managers.datasets 

1366 manager._db.update( 

1367 manager._static.dataset_type, 

1368 {"name": datasetTypeName}, 

1369 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"}, 

1370 ) 

1371 

1372 # Force reset of dataset type cache 

1373 butler.registry.refresh() 

1374 

1375 datasetType_new = butler.registry.getDatasetType(datasetTypeName) 

1376 self.assertEqual(datasetType_new.name, datasetType_ori.name) 

1377 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel") 

1378 

1379 metric_model = butler.get(datasetTypeName, dataId=dataId) 

1380 self.assertNotEqual(type(metric_model), type(metric)) 

1381 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel") 

1382 

1383 # Put the model and read it back to show that everything now 

1384 # works as normal. 

1385 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424) 

1386 metric_model_new = butler.get(metric_ref) 

1387 self.assertEqual(metric_model_new, metric_model) 

1388 

1389 # Hack the storage class again to something that will fail on the 

1390 # get with no conversion class. 

1391 manager._db.update( 

1392 manager._static.dataset_type, 

1393 {"name": datasetTypeName}, 

1394 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"}, 

1395 ) 

1396 butler.registry.refresh() 

1397 

1398 with self.assertRaises(ValueError): 

1399 butler.get(datasetTypeName, dataId=dataId) 

1400 

1401 

1402class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1403 """InMemoryDatastore specialization of a butler""" 

1404 

1405 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1406 fullConfigKey = None 

1407 useTempRoot = False 

1408 validationCanFail = False 

1409 datastoreStr = ["datastore='InMemory"] 

1410 datastoreName = ["InMemoryDatastore@"] 

1411 registryStr = "/gen3.sqlite3" 

1412 

1413 def testIngest(self): 
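# No-op override: file ingest presumably does not apply to an in-memory

# datastore, so the inherited ingest test is deliberately disabled.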

1414 pass 

1415 

1416 

1417class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1418 """PosixDatastore specialization""" 

1419 

1420 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

1421 fullConfigKey = ".datastore.datastores.1.formatters" 

1422 validationCanFail = True 

1423 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"] 

1424 datastoreName = [ 

1425 "InMemoryDatastore@", 

1426 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1", 

1427 "SecondDatastore", 

1428 ] 

1429 registryStr = "/gen3.sqlite3" 

1430 

1431 

1432class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase): 

1433 """Test that a yaml file in one location can refer to a root in another.""" 

1434 

1435 datastoreStr = ["dir1"] 

1436 # Disable the makeRepo test since we are deliberately not using 

1437 # butler.yaml as the config name. 

1438 fullConfigKey = None 

1439 

1440 def setUp(self): 

1441 self.root = makeTestTempDir(TESTDIR) 

1442 

1443 # Make a new repository in one place 

1444 self.dir1 = os.path.join(self.root, "dir1") 

1445 Butler.makeRepo(self.dir1, config=Config(self.configFile)) 

1446 

1447 # Move the yaml file to a different place and add a "root" 

1448 self.dir2 = os.path.join(self.root, "dir2") 

1449 os.makedirs(self.dir2, exist_ok=True) 

1450 configFile1 = os.path.join(self.dir1, "butler.yaml") 

1451 config = Config(configFile1) 

1452 config["root"] = self.dir1 

1453 configFile2 = os.path.join(self.dir2, "butler2.yaml") 

1454 config.dumpToUri(configFile2) 

1455 os.remove(configFile1) 

1456 self.tmpConfigFile = configFile2 

1457 

1458 def testFileLocations(self): 

1459 self.assertNotEqual(self.dir1, self.dir2) 

1460 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml"))) 

1461 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml"))) 

1462 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3"))) 

1463 

1464 

1465class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase): 

1466 """Test that a config file created by makeRepo outside of repo works.""" 

1467 

1468 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1469 

1470 def setUp(self): 

1471 self.root = makeTestTempDir(TESTDIR) 

1472 self.root2 = makeTestTempDir(TESTDIR) 

1473 

1474 self.tmpConfigFile = os.path.join(self.root2, "different.yaml") 

1475 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1476 

1477 def tearDown(self): 

1478 if os.path.exists(self.root2): 

1479 shutil.rmtree(self.root2, ignore_errors=True) 

1480 super().tearDown() 

1481 

1482 def testConfigExistence(self): 

1483 c = Config(self.tmpConfigFile) 

1484 uri_config = ButlerURI(c["root"]) 

1485 uri_expected = ButlerURI(self.root, forceDirectory=True) 

1486 self.assertEqual(uri_config.geturl(), uri_expected.geturl()) 

1487 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path") 

1488 

1489 def testPutGet(self): 

1490 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1491 self.runPutGetTest(storageClass, "test_metric") 

1492 

1493 

1494class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase): 

1495 """Test that a config file created by makeRepo outside of repo works.""" 

1496 

1497 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1498 

1499 def setUp(self): 

1500 self.root = makeTestTempDir(TESTDIR) 

1501 self.root2 = makeTestTempDir(TESTDIR) 

1502 

1503 self.tmpConfigFile = self.root2 

1504 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1505 

1506 def testConfigExistence(self): 

1507 # Append the yaml file name, since otherwise the Config constructor

1508 # cannot determine the file type.

1509 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml") 

1510 super().testConfigExistence() 

1511 

1512 

1513class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase): 

1514 """Test that a config file created by makeRepo outside of repo works.""" 

1515 

1516 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1517 

1518 def setUp(self): 

1519 self.root = makeTestTempDir(TESTDIR) 

1520 self.root2 = makeTestTempDir(TESTDIR) 

1521 

1522 self.tmpConfigFile = ButlerURI(os.path.join(self.root2, "something.yaml")).geturl() 

1523 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1524 

1525 

1526@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!") 

1527@mock_s3 

1528class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1529 """S3Datastore specialization of a butler; an S3 storage Datastore + 

1530 a local in-memory SqlRegistry. 

1531 """ 

1532 

1533 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml") 

1534 fullConfigKey = None 

1535 validationCanFail = True 

1536 

1537 bucketName = "anybucketname" 

1538 """Name of the Bucket that will be used in the tests. The name is read from 

1539 the config file used with the tests during set-up. 

1540 """ 

1541 

1542 root = "butlerRoot/" 

1543 """Root repository directory expected to be used in case useTempRoot=False. 

1544 Otherwise the root is set to a 20 characters long randomly generated string 

1545 during set-up. 

1546 """ 

1547 

1548 datastoreStr = [f"datastore={root}"] 

1549 """Contains all expected root locations in a format expected to be 

1550 returned by Butler stringification. 

1551 """ 

1552 

1553 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"] 

1554 """The expected format of the S3 Datastore string.""" 

1555 

1556 registryStr = "/gen3.sqlite3" 

1557 """Expected format of the Registry string.""" 

1558 

1559 def genRoot(self): 

1560 """Returns a random string of len 20 to serve as a root 

1561 name for the temporary bucket repo. 

1562 

1563 This is equivalent to tempfile.mkdtemp as this is what self.root 

1564 becomes when useTempRoot is True. 

1565 """ 

1566 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20)) 

1567 return rndstr + "/" 

1568 

1569 def setUp(self): 

1570 config = Config(self.configFile) 

1571 uri = ButlerURI(config[".datastore.datastore.root"]) 

1572 self.bucketName = uri.netloc 

1573 

1574 # set up some fake credentials if they do not exist 

1575 self.usingDummyCredentials = setAwsEnvCredentials() 

1576 

1577 if self.useTempRoot: 

1578 self.root = self.genRoot() 

1579 rooturi = f"s3://{self.bucketName}/{self.root}" 

1580 config.update({"datastore": {"datastore": {"root": rooturi}}}) 

1581 

1582 # need local folder to store registry database 

1583 self.reg_dir = makeTestTempDir(TESTDIR) 

1584 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

1585 

1586 # Moto needs to know that we expect the bucket to exist

1587 # (this used to be the class attribute bucketName)

1588 s3 = boto3.resource("s3") 

1589 s3.create_bucket(Bucket=self.bucketName) 

1590 

1591 self.datastoreStr = f"datastore={self.root}" 

1592 self.datastoreName = [f"FileDatastore@{rooturi}"] 

1593 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False) 

1594 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml") 

1595 

1596 def tearDown(self): 

1597 s3 = boto3.resource("s3") 

1598 bucket = s3.Bucket(self.bucketName) 

1599 try: 

1600 bucket.objects.all().delete() 

1601 except botocore.exceptions.ClientError as e: 

1602 if e.response["Error"]["Code"] == "404": 

1603 # the key was not reachable - pass 

1604 pass 

1605 else: 

1606 raise 

1607 

1608 bucket = s3.Bucket(self.bucketName) 

1609 bucket.delete() 

1610 

1611 # unset any potentially set dummy credentials 

1612 if self.usingDummyCredentials: 

1613 unsetAwsEnvCredentials() 

1614 

1615 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

1616 shutil.rmtree(self.reg_dir, ignore_errors=True) 

1617 

1618 if self.useTempRoot and os.path.exists(self.root): 

1619 shutil.rmtree(self.root, ignore_errors=True) 

1620 

1621 

1622@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!") 

1623# Mock required environment variables during tests 

1624@unittest.mock.patch.dict( 

1625 os.environ, 

1626 { 

1627 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1628 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"), 

1629 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs", 

1630 }, 

1631) 

1632class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1633 """WebdavDatastore specialization of a butler; a Webdav storage Datastore + 

1634 a local in-memory SqlRegistry. 

1635 """ 

1636 

1637 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml") 

1638 fullConfigKey = None 

1639 validationCanFail = True 

1640 

1641 serverName = "localhost" 

1642 """Name of the server that will be used in the tests. 

1643 """ 

1644 

1645 portNumber = 8080 

1646 """Port on which the webdav server listens. Automatically chosen 

1647 at setUpClass via the _getfreeport() method 

1648 """ 

1649 

1650 root = "butlerRoot/" 

1651 """Root repository directory expected to be used in case useTempRoot=False. 

1652 Otherwise the root is set to a 20 characters long randomly generated string 

1653 during set-up. 

1654 """ 

1655 

1656 datastoreStr = [f"datastore={root}"] 

1657 """Contains all expected root locations in a format expected to be 

1658 returned by Butler stringification. 

1659 """ 

1660 

1661 datastoreName = ["FileDatastore@https://{serverName}/{root}"] 

1662 """The expected format of the WebdavDatastore string.""" 

1663 

1664 registryStr = "/gen3.sqlite3" 

1665 """Expected format of the Registry string.""" 

1666 

1667 serverThread = None 

1668 """Thread in which the local webdav server will run""" 

1669 

1670 stopWebdavServer = False 

1671 """This flag will cause the webdav server to 

1672 gracefully shut down when True 

1673 """ 

1674 

1675 def genRoot(self): 

1676 """Returns a random string of len 20 to serve as a root 

1677 name for the temporary bucket repo. 

1678 

1679 This is equivalent to tempfile.mkdtemp as this is what self.root 

1680 becomes when useTempRoot is True. 

1681 """ 

1682 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20)) 

1683 return rndstr + "/" 

1684 

1685 @classmethod 

1686 def setUpClass(cls): 

1687 # Do the same as inherited class 

1688 cls.storageClassFactory = StorageClassFactory() 

1689 cls.storageClassFactory.addFromConfig(cls.configFile) 

1690 

1691 cls.portNumber = cls._getfreeport() 

1692 # Run a local webdav server on which tests will be run 

1693 cls.serverThread = Thread( 

1694 target=cls._serveWebdav, args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), daemon=True 

1695 ) 

1696 cls.serverThread.start() 

1697 # Wait for it to start 

1698 time.sleep(3) 

1699 

1700 @classmethod 

1701 def tearDownClass(cls): 

1702 # Ask for graceful shut down of the webdav server 

1703 cls.stopWebdavServer = True 

1704 # Wait for the thread to exit 

1705 cls.serverThread.join() 

1706 

1707 # Mock required environment variables during tests 

1708 @unittest.mock.patch.dict( 

1709 os.environ, 

1710 { 

1711 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1712 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"), 

1713 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs", 

1714 }, 

1715 ) 

1716 def setUp(self): 

1717 config = Config(self.configFile) 

1718 

1719 if self.useTempRoot: 

1720 self.root = self.genRoot() 

1721 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}" 

1722 config.update({"datastore": {"datastore": {"root": self.rooturi}}}) 

1723 

1724 # need local folder to store registry database 

1725 self.reg_dir = makeTestTempDir(TESTDIR) 

1726 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

1727 

1728 self.datastoreStr = f"datastore={self.root}" 

1729 self.datastoreName = [f"FileDatastore@{self.rooturi}"] 

1730 

1731 if not isWebdavEndpoint(self.rooturi): 

1732 raise OSError("Webdav server not running properly: cannot run tests.") 

1733 

1734 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False) 

1735 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml") 

1736 

1737 # Mock required environment variables during tests 

1738 @unittest.mock.patch.dict( 

1739 os.environ, 

1740 { 

1741 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1742 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"), 

1743 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs", 

1744 }, 

1745 ) 

1746 def tearDown(self): 

1747 # Clear temporary directory 

1748 ButlerURI(self.rooturi).remove() 

1749 ButlerURI(self.rooturi).session.close() 

1750 

1751 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

1752 shutil.rmtree(self.reg_dir, ignore_errors=True) 

1753 

1754 if self.useTempRoot and os.path.exists(self.root): 

1755 shutil.rmtree(self.root, ignore_errors=True) 

1756 

1757 def _serveWebdav(self, port: int, stopWebdavServer): 

1758 """Starts a local webdav-compatible HTTP server, 

1759 Listening on http://localhost:port 

1760 This server only runs when this test class is instantiated, 

1761 and then shuts down. Must be started is a separate thread. 

1762 

1763 Parameters 

1764 ---------- 

1765 port : `int` 

1766 The port number on which the server should listen.

1767 """ 

1768 root_path = gettempdir() 

1769 

1770 config = { 

1771 "host": "0.0.0.0", 

1772 "port": port, 

1773 "provider_mapping": {"/": root_path}, 

1774 "http_authenticator": {"domain_controller": None}, 

1775 "simple_dc": {"user_mapping": {"*": True}}, 

1776 "verbose": 0, 

1777 } 

1778 app = WsgiDAVApp(config) 

1779 

1780 server_args = { 

1781 "bind_addr": (config["host"], config["port"]), 

1782 "wsgi_app": app, 

1783 } 

1784 server = wsgi.Server(**server_args) 

1785 server.prepare() 

1786 

1787 try: 

1788 # Start the actual server in a separate thread 

1789 t = Thread(target=server.serve, daemon=True) 

1790 t.start() 

1791 # watch stopWebdavServer, and gracefully 

1792 # shut down the server when True 

1793 while True: 

1794 if stopWebdavServer(): 

1795 break 

1796 time.sleep(1) 

1797 except KeyboardInterrupt: 

1798 print("Caught Ctrl-C, shutting down...") 

1799 finally: 

1800 server.stop() 

1801 t.join() 

1802 

@staticmethod

1803 def _getfreeport():

1804 """ 

1805 Determine a free port using sockets.

1806 """ 

1807 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 

1808 free_socket.bind(("0.0.0.0", 0)) 

1809 free_socket.listen() 

1810 port = free_socket.getsockname()[1] 

1811 free_socket.close() 

1812 return port 

1813 

1814 

1815class PosixDatastoreTransfers(unittest.TestCase): 

1816 """Test data transfers between butlers. 

1817 

1818 Test for different managers. UUID to UUID and integer to integer are 

1819 tested. UUID to integer is not supported since we do not currently 

1820 want to allow that. Integer to UUID is supported with the caveat 

1821 that UUID4 will be generated and this will be incorrect for raw 

1822 dataset types. The test ignores that. 

1823 """ 

1824 

1825 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1826 

1827 @classmethod 

1828 def setUpClass(cls): 

1829 cls.storageClassFactory = StorageClassFactory() 

1830 cls.storageClassFactory.addFromConfig(cls.configFile) 

1831 

1832 def setUp(self): 

1833 self.root = makeTestTempDir(TESTDIR) 

1834 self.config = Config(self.configFile) 

1835 

1836 def tearDown(self): 

1837 removeTestTempDir(self.root) 

1838 

1839 def create_butler(self, manager, label): 

1840 config = Config(self.configFile) 

1841 config["registry", "managers", "datasets"] = manager 

1842 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True) 

1843 

1844 def create_butlers(self, manager1, manager2): 

1845 self.source_butler = self.create_butler(manager1, "1") 

1846 self.target_butler = self.create_butler(manager2, "2") 

1847 

1848 def testTransferUuidToUuid(self): 

1849 self.create_butlers( 

1850 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1851 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1852 ) 

1853 # Setting id_gen_map should have no effect here 

1854 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE}) 

1855 

1856 def testTransferIntToInt(self): 

1857 self.create_butlers( 

1858 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager", 

1859 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager", 

1860 ) 

1861 # int dataset ID only allows UNIQUE 

1862 self.assertButlerTransfers() 

1863 

1864 def testTransferIntToUuid(self): 

1865 self.create_butlers( 

1866 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager", 

1867 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1868 ) 

1869 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE}) 

1870 

1871 def testTransferMissing(self): 

1872 """Test transfers where datastore records are missing. 

1873 

1874 This is how execution butler works. 

1875 """ 

1876 self.create_butlers( 

1877 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1878 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1879 ) 

1880 

1881 # Configure the source butler to allow trust. 

1882 self.source_butler.datastore.trustGetRequest = True 

1883 

1884 self.assertButlerTransfers(purge=True) 

1885 

1886 def testTransferMissingDisassembly(self): 

1887 """Test transfers where datastore records are missing. 

1888 

1889 This is how execution butler works. 

1890 """ 

1891 self.create_butlers( 

1892 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1893 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1894 ) 

1895 

1896 # Configure the source butler to allow trust. 

1897 self.source_butler.datastore.trustGetRequest = True 

1898 

1899 # Test disassembly. 

1900 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite") 

1901 

1902 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"): 

1903 """Test that a run can be transferred to another butler.""" 

1904 

1905 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

1906 datasetTypeName = "random_data" 

1907 

1908 # Test will create 3 collections and we will want to transfer 

1909 # two of those three. 

1910 runs = ["run1", "run2", "other"] 

1911 

1912 # Also want to use two different dataset types to ensure that 

1913 # grouping works. 

1914 datasetTypeNames = ["random_data", "random_data_2"] 

1915 

1916 # Create the run collections in the source butler. 

1917 for run in runs: 

1918 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

1919 

1920 # Create dimensions in both butlers (transfer will not create them). 

1921 n_exposures = 30 

1922 for butler in (self.source_butler, self.target_butler): 

1923 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

1924 butler.registry.insertDimensionData( 

1925 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

1926 ) 

1927 butler.registry.insertDimensionData( 

1928 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"} 

1929 ) 

1930 

1931 for i in range(n_exposures): 

1932 butler.registry.insertDimensionData( 

1933 "exposure", 

1934 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"}, 

1935 ) 

1936 

1937 # Create dataset types in the source butler. 

1938 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"]) 

1939 for datasetTypeName in datasetTypeNames: 

1940 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

1941 self.source_butler.registry.registerDatasetType(datasetType) 

1942 

1943 # Write a dataset to an unrelated run -- this will ensure that 

1944 # we are rewriting integer dataset ids in the target if necessary. 

1945 # Will not be relevant for UUID. 

1946 run = "distraction" 

1947 butler = Butler(butler=self.source_butler, run=run) 

1948 butler.put( 

1949 makeExampleMetrics(), 

1950 datasetTypeName, 

1951 exposure=1, 

1952 instrument="DummyCamComp", 

1953 physical_filter="d-r", 

1954 ) 

1955 

1956 # Write some example metrics to the source 

1957 butler = Butler(butler=self.source_butler) 

1958 

1959 # Set of DatasetRefs that should be in the list of refs to transfer 

1960 # but which will not be transferred. 

1961 deleted = set() 

1962 

1963 n_expected = 20 # Number of datasets expected to be transferred 

1964 source_refs = [] 

1965 for i in range(n_exposures): 

1966 # Put a third of the datasets into each collection; only retain

1967 # two thirds of them.

1968 index = i % 3 

1969 run = runs[index] 

1970 datasetTypeName = datasetTypeNames[i % 2] 

1971 

1972 metric_data = { 

1973 "summary": {"counter": i}, 

1974 "output": {"text": "metric"}, 

1975 "data": [2 * x for x in range(i)], 

1976 } 

1977 metric = MetricsExample(**metric_data) 

1978 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"} 

1979 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run) 

1980 

1981 # Remove the datastore record using low-level API 

1982 if purge: 

1983 # Remove records for a fraction. 

1984 if index == 1: 

1985 

1986 # For one of these delete the file as well. 

1987 # This allows the "missing" code to filter the 

1988 # file out. 

1989 if not deleted: 

1990 primary, uris = butler.datastore.getURIs(ref) 

1991 if primary: 

1992 primary.remove() 

1993 for uri in uris.values(): 

1994 uri.remove() 

1995 n_expected -= 1 

1996 deleted.add(ref) 

1997 

1998 # Remove the datastore record. 

1999 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

2000 

2001 if index < 2: 

2002 source_refs.append(ref) 

2003 if ref not in deleted: 

2004 new_metric = butler.get(ref.unresolved(), collections=run) 

2005 self.assertEqual(new_metric, metric) 

2006 

2007 # Create some bad dataset types to ensure we check for inconsistent 

2008 # definitions. 

2009 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList") 

2010 for datasetTypeName in datasetTypeNames: 

2011 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass) 

2012 self.target_butler.registry.registerDatasetType(datasetType) 

2013 with self.assertRaises(ConflictingDefinitionError): 

2014 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map) 

2015 # And remove the bad definitions. 

2016 for datasetTypeName in datasetTypeNames: 

2017 self.target_butler.registry.removeDatasetType(datasetTypeName) 

2018 

2019 # Transfer without creating dataset types should fail. 

2020 with self.assertRaises(KeyError): 

2021 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map) 

2022 

2023 # Now transfer them to the second butler 

2024 with self.assertLogs(level=logging.DEBUG) as cm: 

2025 transferred = self.target_butler.transfer_from( 

2026 self.source_butler, source_refs, id_gen_map=id_gen_map, register_dataset_types=True 

2027 ) 

2028 self.assertEqual(len(transferred), n_expected) 

2029 log_output = ";".join(cm.output) 

2030 self.assertIn("found in datastore for chunk", log_output) 

2031 self.assertIn("Creating output run", log_output) 

2032 

2033 # Do the transfer twice to ensure that it will do nothing extra. 

2034 # Only do this if purge=True because it does not work for int 

2035 # dataset_id. 

2036 if purge: 

2037 # This should not need to register dataset types. 

2038 transferred = self.target_butler.transfer_from( 

2039 self.source_butler, source_refs, id_gen_map=id_gen_map 

2040 ) 

2041 self.assertEqual(len(transferred), n_expected) 

2042 

2043 # Also do an explicit low-level transfer to trigger some 

2044 # edge cases. 

2045 with self.assertLogs(level=logging.DEBUG) as cm: 

2046 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs) 

2047 log_output = ";".join(cm.output) 

2048 self.assertIn("no file artifacts exist", log_output) 

2049 

2050 with self.assertRaises(TypeError): 

2051 self.target_butler.datastore.transfer_from(self.source_butler, source_refs) 

2052 

2053 with self.assertRaises(ValueError): 

2054 self.target_butler.datastore.transfer_from( 

2055 self.source_butler.datastore, source_refs, transfer="split" 

2056 ) 

2057 

2058 # Now try to get the same refs from the new butler. 

2059 for ref in source_refs: 

2060 if ref not in deleted: 

2061 unresolved_ref = ref.unresolved() 

2062 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run) 

2063 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run) 

2064 self.assertEqual(new_metric, old_metric) 

2065 

2066 # Now prune run2 collection and create instead a CHAINED collection. 

2067 # This should block the transfer. 

2068 self.target_butler.pruneCollection("run2", purge=True, unstore=True) 

2069 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED) 

2070 with self.assertRaises(TypeError): 

2071 # Re-importing the run1 datasets can be problematic if they

2072 # use integer IDs, so keep only the run2 refs here.

2073 to_transfer = [ref for ref in source_refs if ref.run == "run2"] 

2074 self.target_butler.transfer_from(self.source_butler, to_transfer, id_gen_map=id_gen_map) 

2075 

2076 

2077if __name__ == "__main__": 

2078 unittest.main()