# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler.
"""

import logging
import os
import posixpath
import unittest
import unittest.mock  # used via unittest.mock.patch.dict below; not implied by "import unittest"
import tempfile
import shutil
import pickle
import string
import random
import time
import socket
import pathlib

37 

38try: 

39 import boto3 

40 import botocore 

41 from moto import mock_s3 

42except ImportError: 

43 boto3 = None 

44 

45 def mock_s3(cls): 

46 """A no-op decorator in case moto mock_s3 can not be imported. 

47 """ 

48 return cls 

49 
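
# Optional dependencies for running a local WebDAV server in the tests;
# WebDAV-specific tests are expected to be skipped when WsgiDAVApp is None.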

try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None

import astropy.time
from threading import Thread
from tempfile import gettempdir
from lsst.utils import doImport
from lsst.daf.butler import Butler, Config, ButlerConfig
from lsst.daf.butler import StorageClassFactory
from lsst.daf.butler import DatasetType, DatasetRef, DatasetIdGenEnum
from lsst.daf.butler import FileTemplateValidationError, ValidationError
from lsst.daf.butler import FileDataset
from lsst.daf.butler import CollectionSearch, CollectionType
from lsst.daf.butler import ButlerURI
from lsst.daf.butler import script
from lsst.daf.butler.registry import MissingCollectionError, ConflictingDefinitionError
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.resources.http import isWebdavEndpoint

from lsst.daf.butler.tests import MultiDetectorFormatter, MetricsExample
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
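    """Return an example MetricsExample with summary, output and data
    values matching the assertions made by the tests below.
    """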
    return MetricsExample({"AM1": 5.2, "AM2": 30.6},
                          {"a": [1, 2, 3],
                           "b": {"blue": 5, "red": "green"}},
                          [563, 234, 456.7, 752, 8, 9, 27]
                          )


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent misdiagnosis
    that might otherwise occur when a standard exception is used.
    """
    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests of ButlerConfig behavior that is not covered by the
    other test cases.
    """

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests:
    """Helper class providing a suite of put/get tests that can be run
    against different butler configurations."""
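    # Butler repository root; assigned per test (see setUp in ButlerTests)
    # and removed again in tearDown.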

    root = None

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it.
        """
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
        removeTestTempDir(self.root)

    def runPutGetTest(self, storageClass, datasetTypeName):
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = "ingest"
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "band": "R"})
        butler.registry.insertDimensionData("visit_system", {"instrument": "DummyCamComp",
                                                             "id": 1,
                                                             "name": "default"})
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 423,
                                                      "name": "fourtwentythree", "physical_filter": "d-r",
                                                      "visit_system": 1, "datetime_begin": visit_start,
                                                      "datetime_end": visit_end})

        # Add a second visit for some later tests
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 424,
                                                      "name": "fourtwentyfour", "physical_filter": "d-r",
                                                      "visit_system": 1})

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(butler, ref,
                                             ("summary", "data", "output"), metric,
                                             collections=this_run)

                # Can the artifacts themselves be retrieved?
                if not butler.datastore.isEphemeral:
                    root_uri = ButlerURI(self.root)

                    for preserve_path in (True, False):
                        destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                        # Use copy so that we can test that overwrite
                        # protection works (using "auto" for File URIs would
                        # use hard links and subsequent transfer would work
                        # because it knows they are the same file).
                        transferred = butler.retrieveArtifacts([ref], destination,
                                                               preserve_path=preserve_path, transfer="copy")
                        self.assertGreater(len(transferred), 0)
                        artifacts = list(ButlerURI.findFileResources([destination]))
                        self.assertEqual(set(transferred), set(artifacts))

                        for artifact in transferred:
                            path_in_destination = artifact.relative_to(destination)
                            self.assertIsNotNone(path_in_destination)

                            # When the path is not preserved there should not
                            # be any path separators.
                            num_seps = path_in_destination.count("/")
                            if preserve_path:
                                self.assertGreater(num_seps, 0)
                            else:
                                self.assertEqual(num_seps, 0)

                        primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                        n_uris = len(secondary_uris)
                        if primary_uri:
                            n_uris += 1
                        self.assertEqual(len(artifacts), n_uris,
                                         "Comparing expected artifacts vs actual:"
                                         f" {artifacts} vs {primary_uri} and {secondary_uris}")

                        if preserve_path:
                            # No need to run these twice
                            with self.assertRaises(ValueError):
                                butler.retrieveArtifacts([ref], destination, transfer="move")

                            with self.assertRaises(FileExistsError):
                                butler.retrieveArtifacts([ref], destination)

                            transferred_again = butler.retrieveArtifacts([ref], destination,
                                                                         preserve_path=preserve_path,
                                                                         overwrite=True)
                            self.assertEqual(set(transferred_again), set(transferred))

                # Now remove the dataset completely.
                butler.pruneDatasets([ref], purge=True, unstore=True, run=this_run)
                # Lookup with original args should still fail.
                with self.assertRaises(LookupError):
                    butler.datasetExists(*args, collections=this_run)
                # getDirect() should still fail.
                with self.assertRaises(FileNotFoundError):
                    butler.getDirect(ref)
                # Registry shouldn't be able to find it by dataset_id anymore.
                self.assertIsNone(butler.registry.getDataset(ref.id))

                # Do explicit registry removal since we know the collections
                # are empty.
                butler.registry.removeCollection(this_run)
                expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId,
                                   parameters={"slice": slice(stop)})
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(datasetTypeName, dimensions,
                                              self.storageClassFactory.getStorageClass("Config"))

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present

        return butler

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType("example", dimensions,
                                          self.storageClassFactory.getStorageClass("StructuredData"),
                                          butler.registry)
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "band": "R"})
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 423,
                                                      "name": "fourtwentythree", "physical_filter": "d-r"})
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with TypeError.
        with self.assertRaises(TypeError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is a TypeError.
        with self.assertRaises(TypeError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(TypeError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Deleting the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))

class ButlerTests(ButlerPutGetTests):
    """Tests for Butler.
    """
    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor.
        """
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {"ingest"})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(
            butler2.collections,
            CollectionSearch.fromExpression(["other"])
        )
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
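        # (A repository index is a small config file mapping repository
        # labels to butler config URIs; get_repo_uri and get_known_repos
        # read it via the DAF_BUTLER_REPOSITORY_INDEX environment variable.)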

        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ButlerURI.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ButlerURI(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ButlerURI)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ButlerURI)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ButlerURI)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ButlerURI)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ButlerURI)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ButlerURI)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testIngest(self):
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "band": "R"})
        for detector in (1, 2):
            butler.registry.insertDimensionData("detector", {"instrument": "DummyCamComp", "id": detector,
                                                             "full_name": f"detector{detector}"})

        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 423,
                                                      "name": "fourtwentythree", "physical_filter": "d-r"},
                                            {"instrument": "DummyCamComp", "id": 424,
                                             "name": "fourtwentyfour", "physical_filter": "d-r"})

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile,
                                        refs=[refIn],
                                        formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile,
                                    refs=refs,
                                    formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

    def testPruneCollections(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType("prune_collections_test_dataset", dimensions, storageClass,
                                          butler.registry)
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge and not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])

        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection. (The keyword here is "purge"; the original code had a
        # "pruge" typo that made these calls fail with an unrelated
        # TypeError instead of exercising the rollback path.)
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)

        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [])

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)

    def testPickle(self):
        """Test pickle support.
        """
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            ("instrument", {"instrument": "DummyCam"}, {"instrument": "DummyHSC"},
             {"instrument": "DummyCamComp"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"})
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are not
        # created for components, but querying can still return them.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(ignore=["test_metric_comp", "metric3", "calexp", "DummySC",
                                             "datasetType.component", "random_data", "random_data_2"])

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(ignore=["test_metric_comp", "metric3", "calexp", "DummySC",
                                             "datasetType.component", "random_data", "random_data_2"])

    def testTransaction(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (("instrument", {"instrument": "DummyCam"}),
                            ("physical_filter", {"instrument": "DummyCam", "name": "d-r",
                                                 "band": "R"}),
                            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo",
                                       "physical_filter": "d-r"}))
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref,
                                         ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(LookupError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # Create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True,
                            config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""
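        # Note: the puts below supply day_obs + seq_num rather than an
        # exposure ID; the registry is expected to rewrite the dataId to the
        # matching exposure record, which the assertions at the end verify.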

964 butler = Butler(self.tmpConfigFile, run="ingest") 

965 

966 storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict") 

967 datasetTypeName = "random_data" 

968 

969 # Create dimension records. 

970 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

971 butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp", 

972 "name": "d-r", 

973 "band": "R"}) 

974 butler.registry.insertDimensionData("detector", {"instrument": "DummyCamComp", 

975 "id": 1, "full_name": "det1"}) 

976 

977 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"]) 

978 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

979 butler.registry.registerDatasetType(datasetType) 

980 

981 n_exposures = 5 

982 dayobs = 20210530 

983 

984 for i in range(n_exposures): 

985 butler.registry.insertDimensionData("exposure", {"instrument": "DummyCamComp", 

986 "id": i, "obs_id": f"exp{i}", 

987 "seq_num": i, "day_obs": dayobs, 

988 "physical_filter": "d-r"}) 

989 

990 # Write some data. 

991 for i in range(n_exposures): 

992 metric = {"something": i, 

993 "other": "metric", 

994 "list": [2*x for x in range(i)]} 

995 

996 # Use the seq_num for the put to test rewriting. 

997 dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", 

998 "physical_filter": "d-r"} 

999 ref = butler.put(metric, datasetTypeName, dataId=dataId) 

1000 

1001 # Check that the exposure is correct in the dataId 

1002 self.assertEqual(ref.dataId["exposure"], i) 

1003 

1004 # and check that we can get the dataset back with the same dataId 

1005 new_metric = butler.get(datasetTypeName, dataId=dataId) 

1006 self.assertEqual(new_metric, metric) 

1007 

1008 

class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check if a file exists at a given path (relative to root).

        Test testPutTemplates verifies the actual physical existence of the
        files in the requested location.
        """
        uri = ButlerURI(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "band": "R"})
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 423,
                                                      "name": "v423", "physical_filter": "d-r"})
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 425,
                                                      "name": "v425", "physical_filter": "d-r"})

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(self.checkFileExists(butler.datastore.root,
                                             "ingest/metric1/??#?/d-r/DummyCamComp_423.pickle"),
                        f"Checking existence of {uri}")

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(self.checkFileExists(butler.datastore.root,
                                             "ingest/metric2/d-r/DummyCamComp_v423.pickle"),
                        f"Checking existence of {uri}")

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """This test does an export to a temp directory and an import back
        into a new temp directory repo. It does not assume a posix
        datastore."""
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        print("Root:", exportButler.datastore.root)
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements
                # even though there aren't any in these datasets or in the
                # database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public api.
                with open(exportFile, "r") as f:
                    script.butlerImport(importDir, export_file=f, directory=exportDir,
                                        transfer="auto", skip_dimensions=None, reuse_ids=False)
                importButler = Butler(importDir, run="ingest")
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(list(importButler.registry.queryDimensionRecords("skymap")),
                                 [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)])

    def testRemoveRuns(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put a dataset in each.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType("prune_collections_test_dataset", dimensions, storageClass,
                                          butler.registry)
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])
        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())

class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler"""
    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self):
        """Independent test of constructor using PathLike.
        """
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self):
        """Test local export using several file-based transfer modes."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        datastoreRoot = exportButler.datastore.root

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume local file system
            self.assertTrue(self.checkFileExists(datastoreRoot, path),
                            f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml",
                                         transfer=transfer) as export:
                    export.saveDatasets(datasets)
                for path in pathsInStore:
                    self.assertTrue(self.checkFileExists(exportDir, path),
                                    f"Check that mode {transfer} exported files")

1225 def testPruneDatasets(self): 

1226 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1227 butler = Butler(self.tmpConfigFile, writeable=True) 

1228 # Load registry data with dimensions to hang datasets off of. 

1229 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry")) 

1230 butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) 

1231 # Add some RUN-type collections. 

1232 run1 = "run1" 

1233 butler.registry.registerRun(run1) 

1234 run2 = "run2" 

1235 butler.registry.registerRun(run2) 

1236 # put some datasets. ref1 and ref2 have the same data ID, and are in 

1237 # different runs. ref3 has a different data ID. 

1238 metric = makeExampleMetrics() 

1239 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"]) 

1240 datasetType = self.addDatasetType("prune_collections_test_dataset", dimensions, storageClass, 

1241 butler.registry) 

1242 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1) 

1243 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2) 

1244 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1) 

1245 

1246 # Simple prune. 

1247 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1248 with self.assertRaises(LookupError): 

1249 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1) 

1250 

1251 # Put data back. 

1252 ref1 = butler.put(metric, ref1.unresolved(), run=run1) 

1253 ref2 = butler.put(metric, ref2.unresolved(), run=run2) 

1254 ref3 = butler.put(metric, ref3.unresolved(), run=run1) 

1255 

1256 # Check that in normal mode, deleting the record will lead to 

1257 # trash not touching the file. 

1258 uri1 = butler.datastore.getURI(ref1) 

1259 butler.datastore.bridge.moveToTrash([ref1]) # Update the dataset_location table 

1260 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id}) 
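
        # With the record gone, the trash machinery has nothing tying the

        # trashed ref to a file artifact, so the artifact should survive

        # emptyTrash() (asserted below) and has to be removed by hand.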

1261 butler.datastore.trash(ref1) 

1262 butler.datastore.emptyTrash() 

1263 self.assertTrue(uri1.exists()) 

1264 uri1.remove() # Clean it up. 

1265 

1266 # Simulate execution butler setup by deleting the datastore 

1267 # record but keeping the file around and trusting. 

1268 butler.datastore.trustGetRequest = True 
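
        # (With trustGetRequest enabled the datastore is willing to look

        # for file artifacts even when no datastore record exists, which

        # is what lets trash() below remove the ref2 file despite its

        # deleted record.)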

1269 uri2 = butler.datastore.getURI(ref2) 

1270 uri3 = butler.datastore.getURI(ref3) 

1271 self.assertTrue(uri2.exists()) 

1272 self.assertTrue(uri3.exists()) 

1273 

1274 # Remove the datastore record. 

1275 butler.datastore.bridge.moveToTrash([ref2]) # Update the dataset_location table 

1276 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id}) 

1277 self.assertTrue(uri2.exists()) 

1278 butler.datastore.trash([ref2, ref3]) 

1279 # Immediate removal of the ref2 file. 

1280 self.assertFalse(uri2.exists()) 

1281 # But ref3 has to wait for emptyTrash. 

1282 self.assertTrue(uri3.exists()) 

1283 butler.datastore.emptyTrash() 

1284 self.assertFalse(uri3.exists()) 

1285 

1286 # Clear out the datasets from registry. 

1287 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1288 

1289 

1290class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1291 """InMemoryDatastore specialization of a butler""" 

1292 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1293 fullConfigKey = None 

1294 useTempRoot = False 

1295 validationCanFail = False 

1296 datastoreStr = ["datastore='InMemory"] 

1297 datastoreName = ["InMemoryDatastore@"] 

1298 registryStr = "/gen3.sqlite3" 

1299 

1300 def testIngest(self): 

1301 pass 

1302 

1303 

1304class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1305 """ChainedDatastore specialization of a butler""" 

1306 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

1307 fullConfigKey = ".datastore.datastores.1.formatters" 

1308 validationCanFail = True 

1309 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"] 

1310 datastoreName = ["InMemoryDatastore@", f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1", 

1311 "SecondDatastore"] 

1312 registryStr = "/gen3.sqlite3" 

1313 

1314 

1315class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase): 

1316 """Test that a yaml file in one location can refer to a root in another.""" 

1317 

1318 datastoreStr = ["dir1"] 

1319 # Disable the makeRepo test since we are deliberately not using 

1320 # butler.yaml as the config name. 

1321 fullConfigKey = None 

1322 

1323 def setUp(self): 

1324 self.root = makeTestTempDir(TESTDIR) 

1325 

1326 # Make a new repository in one place 

1327 self.dir1 = os.path.join(self.root, "dir1") 

1328 Butler.makeRepo(self.dir1, config=Config(self.configFile)) 

1329 

1330 # Move the yaml file to a different place and add a "root" 

1331 self.dir2 = os.path.join(self.root, "dir2") 

1332 os.makedirs(self.dir2, exist_ok=True) 

1333 configFile1 = os.path.join(self.dir1, "butler.yaml") 

1334 config = Config(configFile1) 

1335 config["root"] = self.dir1 

1336 configFile2 = os.path.join(self.dir2, "butler2.yaml") 

1337 config.dumpToUri(configFile2) 

1338 os.remove(configFile1) 

1339 self.tmpConfigFile = configFile2 

1340 

1341 def testFileLocations(self): 

1342 self.assertNotEqual(self.dir1, self.dir2) 

1343 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml"))) 

1344 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml"))) 

1345 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3"))) 

1346 

1347 

1348class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase): 

1349 """Test that a config file created by makeRepo outside of the repo works.""" 

1350 

1351 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1352 

1353 def setUp(self): 

1354 self.root = makeTestTempDir(TESTDIR) 

1355 self.root2 = makeTestTempDir(TESTDIR) 

1356 

1357 self.tmpConfigFile = os.path.join(self.root2, "different.yaml") 

1358 Butler.makeRepo(self.root, config=Config(self.configFile), 

1359 outfile=self.tmpConfigFile) 

1360 

1361 def tearDown(self): 

1362 if os.path.exists(self.root2): 

1363 shutil.rmtree(self.root2, ignore_errors=True) 

1364 super().tearDown() 

1365 

1366 def testConfigExistence(self): 

1367 c = Config(self.tmpConfigFile) 

1368 uri_config = ButlerURI(c["root"]) 

1369 uri_expected = ButlerURI(self.root, forceDirectory=True) 

1370 self.assertEqual(uri_config.geturl(), uri_expected.geturl()) 

1371 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path") 

1372 

1373 def testPutGet(self): 

1374 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1375 self.runPutGetTest(storageClass, "test_metric") 

1376 

1377 

1378class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase): 

1379 """Test that a config file created by makeRepo outside of the repo works when outfile is a directory.""" 

1380 

1381 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1382 

1383 def setUp(self): 

1384 self.root = makeTestTempDir(TESTDIR) 

1385 self.root2 = makeTestTempDir(TESTDIR) 

1386 

1387 self.tmpConfigFile = self.root2 

1388 Butler.makeRepo(self.root, config=Config(self.configFile), 

1389 outfile=self.tmpConfigFile) 

1390 

1391 def testConfigExistence(self): 

1392 # Append the yaml file name, else the Config constructor does not 

1393 # know the file type. 

1394 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml") 

1395 super().testConfigExistence() 

1396 

1397 

1398class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase): 

1399 """Test that a config file created by makeRepo outside of the repo works when outfile is given as a URI.""" 

1400 

1401 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1402 

1403 def setUp(self): 

1404 self.root = makeTestTempDir(TESTDIR) 

1405 self.root2 = makeTestTempDir(TESTDIR) 

1406 

1407 self.tmpConfigFile = ButlerURI(os.path.join(self.root2, "something.yaml")).geturl() 

1408 Butler.makeRepo(self.root, config=Config(self.configFile), 

1409 outfile=self.tmpConfigFile) 

1410 

1411 

1412@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!") 

1413@mock_s3 

1414class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1415 """S3Datastore specialization of a butler: an S3 storage datastore 

1416 plus a local SQLite SqlRegistry. 

1417 """ 

1418 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml") 

1419 fullConfigKey = None 

1420 validationCanFail = True 

1421 

1422 bucketName = "anybucketname" 

1423 """Name of the bucket that will be used in the tests. The name is read 

1424 from the config file during set-up. 

1425 """ 

1426 

1427 root = "butlerRoot/" 

1428 """Root repository directory used when useTempRoot is False. 

1429 Otherwise the root is set to a randomly generated 20-character string 

1430 during set-up. 

1431 """ 

1432 

1433 datastoreStr = [f"datastore={root}"] 

1434 """Contains all expected root locations in a format expected to be 

1435 returned by Butler stringification. 

1436 """ 

1437 

1438 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"] 

1439 """The expected format of the S3 Datastore string.""" 
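
    # Note: the braces above are literal placeholders rather than an

    # f-string; setUp() replaces datastoreName with the real root URI.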

1440 

1441 registryStr = "/gen3.sqlite3" 

1442 """Expected format of the Registry string.""" 

1443 

1444 def genRoot(self): 

1445 """Returns a random 20-character string to serve as the root 

1446 name for the temporary bucket repo. 

1447 

1448 This is the equivalent of tempfile.mkdtemp, since this is what 

1449 self.root becomes when useTempRoot is True. 

1450 """ 

1451 rndstr = "".join( 

1452 random.choice(string.ascii_uppercase + string.digits) for _ in range(20) 

1453 ) 

1454 return rndstr + "/" 

1455 

1456 def setUp(self): 

1457 config = Config(self.configFile) 

1458 uri = ButlerURI(config[".datastore.datastore.root"]) 

1459 self.bucketName = uri.netloc 

1460 

1461 # set up some fake credentials if they do not exist 

1462 self.usingDummyCredentials = setAwsEnvCredentials() 

1463 

1464 if self.useTempRoot: 

1465 self.root = self.genRoot() 

1466 rooturi = f"s3://{self.bucketName}/{self.root}" 

1467 config.update({"datastore": {"datastore": {"root": rooturi}}}) 

1468 

1469 # Need a local folder to store the registry database. 

1470 self.reg_dir = makeTestTempDir(TESTDIR) 

1471 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

1472 

1473 # Moto needs to know that we expect the bucket to exist 

1474 # (its name used to be the class attribute bucketName). 

1475 s3 = boto3.resource("s3") 

1476 s3.create_bucket(Bucket=self.bucketName) 

1477 

1478 self.datastoreStr = f"datastore={self.root}" 

1479 self.datastoreName = [f"FileDatastore@{rooturi}"] 

1480 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False) 

1481 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml") 

1482 

1483 def tearDown(self): 

1484 s3 = boto3.resource("s3") 

1485 bucket = s3.Bucket(self.bucketName) 

1486 try: 

1487 bucket.objects.all().delete() 

1488 except botocore.exceptions.ClientError as e: 

1489 if e.response["Error"]["Code"] == "404": 

1490 # the key was not reachable - pass 

1491 pass 

1492 else: 

1493 raise 

1494 

1495 bucket = s3.Bucket(self.bucketName) 

1496 bucket.delete() 

1497 

1498 # unset any potentially set dummy credentials 

1499 if self.usingDummyCredentials: 

1500 unsetAwsEnvCredentials() 

1501 

1502 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

1503 shutil.rmtree(self.reg_dir, ignore_errors=True) 

1504 

1505 if self.useTempRoot and os.path.exists(self.root): 

1506 shutil.rmtree(self.root, ignore_errors=True) 

1507 

1508 

1509@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!") 

1510# Mock required environment variables during tests 

1511@unittest.mock.patch.dict(os.environ, {"LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1512 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join( 

1513 TESTDIR, "config/testConfigs/webdav/token"), 

1514 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs"}) 

1515class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1516 """WebdavDatastore specialization of a butler: a webdav storage 

1517 datastore plus a local SQLite SqlRegistry. 

1518 """ 

1519 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml") 

1520 fullConfigKey = None 

1521 validationCanFail = True 

1522 

1523 serverName = "localhost" 

1524 """Name of the server that will be used in the tests. 

1525 """ 

1526 

1527 portNumber = 8080 

1528 """Port on which the webdav server listens. Automatically chosen 

1529 in setUpClass via the _getfreeport() method. 

1530 """ 

1531 

1532 root = "butlerRoot/" 

1533 """Root repository directory used when useTempRoot is False. 

1534 Otherwise the root is set to a randomly generated 20-character string 

1535 during set-up. 

1536 """ 

1537 

1538 datastoreStr = [f"datastore={root}"] 

1539 """Contains all expected root locations in a format expected to be 

1540 returned by Butler stringification. 

1541 """ 

1542 

1543 datastoreName = ["FileDatastore@https://{serverName}/{root}"] 

1544 """The expected format of the WebdavDatastore string.""" 

1545 

1546 registryStr = "/gen3.sqlite3" 

1547 """Expected format of the Registry string.""" 

1548 

1549 serverThread = None 

1550 """Thread in which the local webdav server will run""" 

1551 

1552 stopWebdavServer = False 

1553 """This flag will cause the webdav server to 

1554 gracefully shut down when set to True. 

1555 """ 

1556 

1557 def genRoot(self): 

1558 """Returns a random 20-character string to serve as the root 

1559 name for the temporary webdav repo. 

1560 

1561 This is the equivalent of tempfile.mkdtemp, since this is what 

1562 self.root becomes when useTempRoot is True. 

1563 """ 

1564 rndstr = "".join( 

1565 random.choice(string.ascii_uppercase + string.digits) for _ in range(20) 

1566 ) 

1567 return rndstr + "/" 

1568 

1569 @classmethod 

1570 def setUpClass(cls): 

1571 # Do the same as the inherited class. 

1572 cls.storageClassFactory = StorageClassFactory() 

1573 cls.storageClassFactory.addFromConfig(cls.configFile) 

1574 

1575 cls.portNumber = cls._getfreeport() 

1576 # Run a local webdav server on which tests will be run 

1577 cls.serverThread = Thread(target=cls._serveWebdav, 

1578 args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), 

1579 daemon=True) 

1580 cls.serverThread.start() 

1581 # Wait for it to start 

1582 time.sleep(3) 
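
        # (A fixed sleep is a crude readiness wait; setUp() additionally

        # verifies the endpoint with isWebdavEndpoint() before use.)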

1583 

1584 @classmethod 

1585 def tearDownClass(cls): 

1586 # Ask for graceful shut down of the webdav server 

1587 cls.stopWebdavServer = True 

1588 # Wait for the thread to exit 

1589 cls.serverThread.join() 

1590 

1591 # Mock required environment variables during tests 

1592 @unittest.mock.patch.dict(os.environ, {"LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1593 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join( 

1594 TESTDIR, "config/testConfigs/webdav/token"), 

1595 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs"}) 

1596 def setUp(self): 

1597 config = Config(self.configFile) 

1598 

1599 if self.useTempRoot: 

1600 self.root = self.genRoot() 

1601 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}" 

1602 config.update({"datastore": {"datastore": {"root": self.rooturi}}}) 

1603 

1604 # Need a local folder to store the registry database. 

1605 self.reg_dir = makeTestTempDir(TESTDIR) 

1606 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

1607 

1608 self.datastoreStr = f"datastore={self.root}" 

1609 self.datastoreName = [f"FileDatastore@{self.rooturi}"] 

1610 

1611 if not isWebdavEndpoint(self.rooturi): 

1612 raise OSError("Webdav server not running properly: cannot run tests.") 

1613 

1614 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False) 

1615 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml") 

1616 

1617 # Mock required environment variables during tests 

1618 @unittest.mock.patch.dict(os.environ, {"LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1619 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join( 

1620 TESTDIR, "config/testConfigs/webdav/token"), 

1621 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs"}) 

1622 def tearDown(self): 

1623 # Clear temporary directory 

1624 ButlerURI(self.rooturi).remove() 

1625 ButlerURI(self.rooturi).session.close() 

1626 

1627 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

1628 shutil.rmtree(self.reg_dir, ignore_errors=True) 

1629 

1630 if self.useTempRoot and os.path.exists(self.root): 

1631 shutil.rmtree(self.root, ignore_errors=True) 

1632 

1633 def _serveWebdav(self, port: int, stopWebdavServer): 

1634 """Starts a local webdav-compatible HTTP server, 

1635 listening on http://localhost:port. 

1636 This server only runs while this test class is instantiated, 

1637 and then shuts down. Must be started in a separate thread. 

1638 

1639 Parameters 

1640 ---------- 

1641 port : `int` 

1642 The port number on which the server should listen. 

        stopWebdavServer : callable 

            Callable returning `True` when the server should shut down; 

            polled once per second. 

1643 """ 

1644 root_path = gettempdir() 

1645 

1646 config = { 

1647 "host": "0.0.0.0", 

1648 "port": port, 

1649 "provider_mapping": {"/": root_path}, 

1650 "http_authenticator": { 

1651 "domain_controller": None 

1652 }, 

1653 "simple_dc": {"user_mapping": {"*": True}}, 

1654 "verbose": 0, 

1655 } 

1656 app = WsgiDAVApp(config) 

1657 

1658 server_args = { 

1659 "bind_addr": (config["host"], config["port"]), 

1660 "wsgi_app": app, 

1661 } 

1662 server = wsgi.Server(**server_args) 

1663 server.prepare() 

1664 

1665 try: 

1666 # Start the actual server in a separate thread 

1667 t = Thread(target=server.serve, daemon=True) 

1668 t.start() 

1669 # Watch stopWebdavServer, and gracefully 

1670 # shut down the server when it returns True 

1671 while True: 

1672 if stopWebdavServer(): 

1673 break 

1674 time.sleep(1) 

1675 except KeyboardInterrupt: 

1676 print("Caught Ctrl-C, shutting down...") 

1677 finally: 

1678 server.stop() 

1679 t.join() 

1680 

1681 def _getfreeport(): 

1682 """Determines a free port using sockets.""" 

1685 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 

1686 free_socket.bind(('0.0.0.0', 0)) 

1687 free_socket.listen() 

1688 port = free_socket.getsockname()[1] 

1689 free_socket.close() 
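
        # The port is only guaranteed free at this instant; another

        # process could claim it after close() and before the webdav

        # server binds. That small race is acceptable for a test helper.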

1690 return port 

1691 

1692 

1693class PosixDatastoreTransfers(unittest.TestCase): 

1694 """Test data transfers between butlers. 

1695 

1696 Tests cover different manager pairings: UUID to UUID and integer to 

1697 integer are tested. UUID to integer is not supported since we do not 

1698 currently want to allow that. Integer to UUID is supported, with the 

1699 caveat that a random UUID4 will be generated, which is incorrect for 

1700 raw dataset types. The test ignores that. 

1701 """ 

1702 

1703 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1704 

1705 @classmethod 

1706 def setUpClass(cls): 

1707 cls.storageClassFactory = StorageClassFactory() 

1708 cls.storageClassFactory.addFromConfig(cls.configFile) 

1709 

1710 def setUp(self): 

1711 self.root = makeTestTempDir(TESTDIR) 

1712 self.config = Config(self.configFile) 

1713 

1714 def tearDown(self): 

1715 removeTestTempDir(self.root) 

1716 

1717 def create_butler(self, manager, label): 

1718 config = Config(self.configFile) 

1719 config["registry", "managers", "datasets"] = manager 
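
        # The fully qualified manager class chosen here determines whether

        # the registry hands out integer or UUID dataset IDs (see the

        # class docstring above).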

1720 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), 

1721 writeable=True) 

1722 

1723 def create_butlers(self, manager1, manager2): 

1724 self.source_butler = self.create_butler(manager1, "1") 

1725 self.target_butler = self.create_butler(manager2, "2") 

1726 

1727 def testTransferUuidToUuid(self): 

1728 self.create_butlers("lsst.daf.butler.registry.datasets.byDimensions." 

1729 "ByDimensionsDatasetRecordStorageManagerUUID", 

1730 "lsst.daf.butler.registry.datasets.byDimensions." 

1731 "ByDimensionsDatasetRecordStorageManagerUUID", 

1732 ) 

1733 # Setting id_gen_map should have no effect here 

1734 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE}) 
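
        # (id_gen_map maps dataset type names to a DatasetIdGenEnum mode;

        # DATAID_TYPE asks for an ID derived deterministically from the

        # dataset type and data ID rather than the default random UUID4.)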

1735 

1736 def testTransferIntToInt(self): 

1737 self.create_butlers("lsst.daf.butler.registry.datasets.byDimensions." 

1738 "ByDimensionsDatasetRecordStorageManager", 

1739 "lsst.daf.butler.registry.datasets.byDimensions." 

1740 "ByDimensionsDatasetRecordStorageManager", 

1741 ) 

1742 # int dataset ID only allows UNIQUE 

1743 self.assertButlerTransfers() 

1744 

1745 def testTransferIntToUuid(self): 

1746 self.create_butlers("lsst.daf.butler.registry.datasets.byDimensions." 

1747 "ByDimensionsDatasetRecordStorageManager", 

1748 "lsst.daf.butler.registry.datasets.byDimensions." 

1749 "ByDimensionsDatasetRecordStorageManagerUUID", 

1750 ) 

1751 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE}) 

1752 

1753 def testTransferMissing(self): 

1754 """Test transfers where datastore records are missing. 

1755 

1756 This is how execution butler works. 

1757 """ 

1758 self.create_butlers("lsst.daf.butler.registry.datasets.byDimensions." 

1759 "ByDimensionsDatasetRecordStorageManagerUUID", 

1760 "lsst.daf.butler.registry.datasets.byDimensions." 

1761 "ByDimensionsDatasetRecordStorageManagerUUID", 

1762 ) 

1763 

1764 # Configure the source butler to allow trust. 

1765 self.source_butler.datastore.trustGetRequest = True 

1766 

1767 self.assertButlerTransfers(purge=True) 

1768 

1769 def testTransferMissingDisassembly(self): 

1770 """Test transfers where datastore records are missing. 

1771 

1772 This is how execution butler works. 

1773 """ 

1774 self.create_butlers("lsst.daf.butler.registry.datasets.byDimensions." 

1775 "ByDimensionsDatasetRecordStorageManagerUUID", 

1776 "lsst.daf.butler.registry.datasets.byDimensions." 

1777 "ByDimensionsDatasetRecordStorageManagerUUID", 

1778 ) 

1779 

1780 # Configure the source butler to allow trust. 

1781 self.source_butler.datastore.trustGetRequest = True 

1782 

1783 # Test disassembly. 

1784 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite") 

1785 

1786 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"): 

1787 """Test that a run can be transferred to another butler.""" 

1788 

1789 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

1790 datasetTypeName = "random_data" 

1791 

1792 # Test will create 3 collections and we will want to transfer 

1793 # two of those three. 

1794 runs = ["run1", "run2", "other"] 

1795 

1796 # Also want to use two different dataset types to ensure that 

1797 # grouping works. 

1798 datasetTypeNames = ["random_data", "random_data_2"] 

1799 

1800 # Create the run collections in the source butler. 

1801 for run in runs: 

1802 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

1803 

1804 # Create dimensions in both butlers (transfer will not create them). 

1805 n_exposures = 30 

1806 for butler in (self.source_butler, self.target_butler): 

1807 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

1808 butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp", 

1809 "name": "d-r", 

1810 "band": "R"}) 

1811 butler.registry.insertDimensionData("detector", {"instrument": "DummyCamComp", 

1812 "id": 1, "full_name": "det1"}) 

1813 

1814 for i in range(n_exposures): 

1815 butler.registry.insertDimensionData("exposure", {"instrument": "DummyCamComp", 

1816 "id": i, "obs_id": f"exp{i}", 

1817 "physical_filter": "d-r"}) 

1818 

1819 # Create dataset types in the source butler. 

1820 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"]) 

1821 for datasetTypeName in datasetTypeNames: 

1822 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

1823 self.source_butler.registry.registerDatasetType(datasetType) 

1824 

1825 # Write a dataset to an unrelated run -- this will ensure that 

1826 # we are rewriting integer dataset ids in the target if necessary. 

1827 # Will not be relevant for UUID. 

1828 run = "distraction" 

1829 butler = Butler(butler=self.source_butler, run=run) 

1830 butler.put(makeExampleMetrics(), datasetTypeName, 

1831 exposure=1, instrument="DummyCamComp", physical_filter="d-r") 

1832 

1833 # Write some example metrics to the source 

1834 butler = Butler(butler=self.source_butler) 

1835 

1836 # Set of DatasetRefs that should be in the list of refs to transfer 

1837 # but which will not be transferred. 

1838 deleted = set() 

1839 

1840 n_expected = 20 # Number of datasets expected to be transferred 

1841 source_refs = [] 

1842 for i in range(n_exposures): 

1843 # Put a third of the datasets into each collection; only retain 

1844 # two thirds. 

1845 index = i % 3 

1846 run = runs[index] 

1847 datasetTypeName = datasetTypeNames[i % 2] 

1848 

1849 metric_data = {"summary": {"counter": i}, 

1850 "output": {"text": "metric"}, 

1851 "data": [2*x for x in range(i)]} 

1852 metric = MetricsExample(**metric_data) 

1853 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"} 

1854 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run) 

1855 

1856 # Remove the datastore record using the low-level API 

1857 if purge: 

1858 # Remove records for a fraction. 

1859 if index == 1: 

1860 

1861 # For one of these delete the file as well. 

1862 # This allows the "missing" code to filter the 

1863 # file out. 

1864 if not deleted: 

1865 primary, uris = butler.datastore.getURIs(ref) 

1866 if primary: 

1867 primary.remove() 

1868 for uri in uris.values(): 

1869 uri.remove() 

1870 n_expected -= 1 

1871 deleted.add(ref) 

1872 

1873 # Remove the datastore record. 

1874 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

1875 

1876 if index < 2: 

1877 source_refs.append(ref) 

1878 if ref not in deleted: 

1879 new_metric = butler.get(ref.unresolved(), collections=run) 

1880 self.assertEqual(new_metric, metric) 

1881 

1882 # Create some bad dataset types to ensure we check for inconsistent 

1883 # definitions. 

1884 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList") 

1885 for datasetTypeName in datasetTypeNames: 

1886 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass) 

1887 self.target_butler.registry.registerDatasetType(datasetType) 

1888 with self.assertRaises(ConflictingDefinitionError): 

1889 self.target_butler.transfer_from(self.source_butler, source_refs, 

1890 id_gen_map=id_gen_map) 

1891 # And remove the bad definitions. 

1892 for datasetTypeName in datasetTypeNames: 

1893 self.target_butler.registry.removeDatasetType(datasetTypeName) 

1894 

1895 # Transfer without creating dataset types should fail. 

1896 with self.assertRaises(KeyError): 

1897 self.target_butler.transfer_from(self.source_butler, source_refs, 

1898 id_gen_map=id_gen_map) 

1899 

1900 # Now transfer them to the second butler 

1901 with self.assertLogs(level=logging.DEBUG) as cm: 

1902 transferred = self.target_butler.transfer_from(self.source_butler, source_refs, 

1903 id_gen_map=id_gen_map, 

1904 register_dataset_types=True) 

1905 self.assertEqual(len(transferred), n_expected) 

1906 log_output = ";".join(cm.output) 

1907 self.assertIn("found in datastore for chunk", log_output) 

1908 self.assertIn("Creating output run", log_output) 

1909 

1910 # Do the transfer twice to ensure that it will do nothing extra. 

1911 # Only do this if purge=True because it does not work for int 

1912 # dataset IDs. 

1913 if purge: 

1914 # This should not need to register dataset types. 

1915 transferred = self.target_butler.transfer_from(self.source_butler, source_refs, 

1916 id_gen_map=id_gen_map) 

1917 self.assertEqual(len(transferred), n_expected) 

1918 

1919 # Also do an explicit low-level transfer to trigger some 

1920 # edge cases. 

1921 with self.assertLogs(level=logging.DEBUG) as cm: 

1922 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs) 

1923 log_output = ";".join(cm.output) 

1924 self.assertIn("no file artifacts exist", log_output) 

1925 

1926 with self.assertRaises(TypeError): 

1927 self.target_butler.datastore.transfer_from(self.source_butler, source_refs) 

1928 

1929 with self.assertRaises(ValueError): 

1930 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs, 

1931 transfer="split") 

1932 

1933 # Now try to get the same refs from the new butler. 

1934 for ref in source_refs: 

1935 if ref not in deleted: 

1936 unresolved_ref = ref.unresolved() 

1937 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run) 

1938 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run) 

1939 self.assertEqual(new_metric, old_metric) 

1940 

1941 # Now prune the run2 collection and instead create a CHAINED collection. 

1942 # This should block the transfer. 

1943 self.target_butler.pruneCollection("run2", purge=True, unstore=True) 

1944 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED) 
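
        # run2 now exists as a CHAINED collection, so attempting to

        # import datasets into it as if it were a RUN should raise.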

1945 with self.assertRaises(TypeError): 

1946 # Re-importing the run1 datasets can be problematic if they 

1947 # use integer IDs, so filter those out. 

1948 to_transfer = [ref for ref in source_refs if ref.run == "run2"] 

1949 self.target_butler.transfer_from(self.source_butler, to_transfer, 

1950 id_gen_map=id_gen_map) 

1951 

1952 

1953if __name__ == "__main__": 

1954 unittest.main()