# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler.
"""

import logging
import os
import posixpath
import unittest
import unittest.mock  # Needed explicitly; ``import unittest`` alone does not provide ``unittest.mock``.
import tempfile
import shutil
import pickle
import string
import random
import time
import socket
import pathlib

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto's mock_s3 cannot be imported.
        """
        return cls

try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None
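# Note: ``WsgiDAVApp`` doubles as a feature flag: tests that need a local
# WebDAV server (presumably defined later in the file, beyond this excerpt,
# given the cheroot/Thread imports) can check it for None and skip
# themselves, mirroring the boto3/mock_s3 fallback above.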

import astropy.time
from threading import Thread
from tempfile import gettempdir
from lsst.utils import doImport
from lsst.daf.butler import Butler, Config, ButlerConfig
from lsst.daf.butler import StorageClassFactory
from lsst.daf.butler import DatasetType, DatasetRef, DatasetIdGenEnum
from lsst.daf.butler import FileTemplateValidationError, ValidationError
from lsst.daf.butler import FileDataset
from lsst.daf.butler import CollectionSearch, CollectionType
from lsst.daf.butler import ButlerURI
from lsst.daf.butler import script
from lsst.daf.butler.registry import MissingCollectionError, ConflictingDefinitionError
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.core._butlerUri.s3utils import (setAwsEnvCredentials,
                                                     unsetAwsEnvCredentials)
from lsst.daf.butler.core._butlerUri.http import isWebdavEndpoint

from lsst.daf.butler.tests import MultiDetectorFormatter, MetricsExample
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
    return MetricsExample({"AM1": 5.2, "AM2": 30.6},
                          {"a": [1, 2, 3],
                           "b": {"blue": 5, "red": "green"}},
                          [563, 234, 456.7, 752, 8, 9, 27]
                          )
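# The three positional arguments above populate, in order, the ``summary``,
# ``output`` and ``data`` attributes that the put/get tests below read back;
# ``data`` (the list) is also what the derived "counter" component counts
# and the "slice" parameter slices.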


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent misdiagnosing
    that might otherwise occur when a standard exception is used.
    """
    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not tested in other test cases.
    """

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests:
    """Helper methods for running a suite of put/get tests from different
    butler configurations."""

    root = None
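    # Subclasses are expected to set ``root`` (a temporary repo directory,
    # removed again in ``tearDown``) and ``tmpConfigFile`` in their ``setUp``;
    # ``configFile`` must point at the butler configuration under test.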

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it.
        """
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
        removeTestTempDir(self.root)

    def runPutGetTest(self, storageClass, datasetTypeName):
        # Datasets are written either to the default run or to per-iteration
        # run collections created below; lookups pass the relevant
        # collections explicitly where needed.
        run = "ingest"
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "band": "R"})
        butler.registry.insertDimensionData("visit_system", {"instrument": "DummyCamComp",
                                                             "id": 1,
                                                             "name": "default"})
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 423,
                                                      "name": "fourtwentythree", "physical_filter": "d-r",
                                                      "visit_system": 1, "datetime_begin": visit_start,
                                                      "datetime_end": visit_end})

        # Add a second visit for some later tests
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 424,
                                                      "name": "fourtwentyfour", "physical_filter": "d-r",
                                                      "visit_system": 1})

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)
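        # ``id=None`` keeps the ref unresolved; ``put`` is responsible for
        # assigning the dataset ID, which is why the resolved ref with a
        # preexisting id below is rejected.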

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(butler, ref,
                                             ("summary", "data", "output"), metric,
                                             collections=this_run)

            # Can the artifacts themselves be retrieved?
            if not butler.datastore.isEphemeral:
                root_uri = ButlerURI(self.root)

                for preserve_path in (True, False):
                    destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                    # Use copy so that we can test that overwrite
                    # protection works (using "auto" for File URIs would
                    # use hard links and subsequent transfer would work
                    # because it knows they are the same file).
                    transferred = butler.retrieveArtifacts([ref], destination,
                                                           preserve_path=preserve_path, transfer="copy")
                    self.assertGreater(len(transferred), 0)
                    artifacts = list(ButlerURI.findFileResources([destination]))
                    self.assertEqual(set(transferred), set(artifacts))

                    for artifact in transferred:
                        path_in_destination = artifact.relative_to(destination)
                        self.assertIsNotNone(path_in_destination)

                        # when path is not preserved there should not be
                        # any path separators.
                        num_seps = path_in_destination.count("/")
                        if preserve_path:
                            self.assertGreater(num_seps, 0)
                        else:
                            self.assertEqual(num_seps, 0)

                    primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                    n_uris = len(secondary_uris)
                    if primary_uri:
                        n_uris += 1
                    self.assertEqual(len(artifacts), n_uris, "Comparing expected artifacts vs actual:"
                                     f" {artifacts} vs {primary_uri} and {secondary_uris}")

                    if preserve_path:
                        # No need to run these twice
                        with self.assertRaises(ValueError):
                            butler.retrieveArtifacts([ref], destination, transfer="move")

                        with self.assertRaises(FileExistsError):
                            butler.retrieveArtifacts([ref], destination)

                        transferred_again = butler.retrieveArtifacts([ref], destination,
                                                                     preserve_path=preserve_path,
                                                                     overwrite=True)
                        self.assertEqual(set(transferred_again), set(transferred))

            # Now remove the dataset completely.
            butler.pruneDatasets([ref], purge=True, unstore=True, run=this_run)
            # Lookup with original args should still fail.
            with self.assertRaises(LookupError):
                butler.datasetExists(*args, collections=this_run)
            # getDirect() should still fail.
            with self.assertRaises(FileNotFoundError):
                butler.getDirect(ref)
            # Registry shouldn't be able to find it by dataset_id anymore.
            self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know they are
            # empty
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId,
                                   parameters={"slice": slice(stop)})
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(datasetTypeName, dimensions,
                                              self.storageClassFactory.getStorageClass("Config"))

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present

        return butler

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType("example", dimensions,
                                          self.storageClassFactory.getStorageClass("StructuredData"),
                                          butler.registry)
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "band": "R"})
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 423,
                                                      "name": "fourtwentythree", "physical_filter": "d-r"})
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with TypeError.
        with self.assertRaises(TypeError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is a TypeError.
        with self.assertRaises(TypeError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(TypeError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Deleting the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler.
    """
    useTempRoot = True
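
    # Concrete subclasses are expected to define ``configFile`` plus the
    # attributes asserted in the tests below: ``fullConfigKey`` (testMakeRepo),
    # ``validationCanFail`` (testGetDatasetTypes), and ``datastoreStr``,
    # ``datastoreName`` and ``registryStr`` (testStringification).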

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor.
        """
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {"ingest"})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(
            butler2.collections,
            CollectionSearch.fromExpression(["other"])
        )
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
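        # The index file is a simple mapping from repository label to butler
        # config URI; pointing the DAF_BUTLER_REPOSITORY_INDEX environment
        # variable at it is what lets Butler.get_repo_uri() and
        # Butler.get_known_repos() resolve labels below.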

        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ButlerURI.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ,
                                              {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ButlerURI(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ,
                                      {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ButlerURI)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ButlerURI)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ButlerURI)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ButlerURI)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ButlerURI)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ButlerURI)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testIngest(self):
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "band": "R"})
        for detector in (1, 2):
            butler.registry.insertDimensionData("detector", {"instrument": "DummyCamComp", "id": detector,
                                                             "full_name": f"detector{detector}"})

        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 423,
                                                      "name": "fourtwentythree", "physical_filter": "d-r"},
                                            {"instrument": "DummyCamComp", "id": 424,
                                             "name": "fourtwentyfour", "physical_filter": "d-r"})

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile,
                                        refs=[refIn],
                                        formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile,
                                    refs=refs,
                                    formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

    def testPruneCollections(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType("prune_collections_test_dataset", dimensions, storageClass,
                                          butler.registry)
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge and not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [])

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)
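        # (Registry will refuse to remove a dataset type while datasets of
        # that type still exist, so the pruning above is a prerequisite.)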

    def testPickle(self):
        """Test pickle support.
        """
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            ("instrument", {"instrument": "DummyCam"}, {"instrument": "DummyHSC"},
             {"instrument": "DummyCamComp"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"})
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are not created
        # for its components, but querying can still return the components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(ignore=["test_metric_comp", "metric3", "calexp", "DummySC",
                                             "datasetType.component", "random_data", "random_data_2"])

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(ignore=["test_metric_comp", "metric3", "calexp", "DummySC",
                                             "datasetType.component", "random_data", "random_data_2"])

    def testTransaction(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (("instrument", {"instrument": "DummyCam"}),
                            ("physical_filter", {"instrument": "DummyCam", "name": "d-r",
                                                 "band": "R"}),
                            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo",
                                       "physical_filter": "d-r"}))
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref,
                                         ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(LookupError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # Create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True,
                            config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""

        butler = Butler(self.tmpConfigFile, run="ingest")

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "band": "R"})
        butler.registry.insertDimensionData("detector", {"instrument": "DummyCamComp",
                                                         "id": 1, "full_name": "det1"})

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData("exposure", {"instrument": "DummyCamComp",
                                                             "id": i, "obs_id": f"exp{i}",
                                                             "seq_num": i, "day_obs": dayobs,
                                                             "physical_filter": "d-r"})

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i,
                      "other": "metric",
                      "list": [2*x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "detector": 1, "instrument": "DummyCamComp",
                      "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Checks if a file exists at a given path (relative to root).

        The test ``testPutTemplates`` verifies the actual physical existence
        of the files in the requested location.
        """
        uri = ButlerURI(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "band": "R"})
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 423,
                                                      "name": "v423", "physical_filter": "d-r"})
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 425,
                                                      "name": "v425", "physical_filter": "d-r"})

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(self.checkFileExists(butler.datastore.root,
                                             "ingest/metric1/??#?/d-r/DummyCamComp_423.pickle"),
                        f"Checking existence of {uri}")

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(self.checkFileExists(butler.datastore.root,
                                             "ingest/metric2/d-r/DummyCamComp_v423.pickle"),
                        f"Checking existence of {uri}")

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """Export datasets to a temporary directory and import them back
        into a fresh temporary repo. Does not assume a posix datastore.
        """
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements even
                # though there aren't any in these datasets or in the database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public api.
                with open(exportFile, "r") as f:
                    script.butlerImport(importDir, export_file=f, directory=exportDir,
                                        transfer="auto", skip_dimensions=None, reuse_ids=False)
                importButler = Butler(importDir, run="ingest")
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(list(importButler.registry.queryDimensionRecords("skymap")),
                                 [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)])

    def testRemoveRuns(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put a dataset in each.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType("prune_collections_test_dataset", dimensions, storageClass,
                                          butler.registry)
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])
        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler"""
    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"
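
    # These class attributes drive the shared assertions in ButlerTests:
    # ``datastoreStr``, ``datastoreName`` and ``registryStr`` are substrings
    # checked by testStringification, ``fullConfigKey`` is probed by
    # testMakeRepo, and ``validationCanFail`` gates testGetDatasetTypes.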

    def testPathConstructor(self):
        """Independent test of constructor using PathLike.
        """
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self):
        """Test local export using several transfer modes."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        datastoreRoot = exportButler.datastore.root

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume local file system
            self.assertTrue(self.checkFileExists(datastoreRoot, path),
                            f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml",
                                         transfer=transfer) as export:
                    export.saveDatasets(datasets)
                for path in pathsInStore:
                    self.assertTrue(self.checkFileExists(exportDir, path),
                                    f"Check that mode {transfer} exported files")

1226 def testPruneDatasets(self): 

1227 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1228 butler = Butler(self.tmpConfigFile, writeable=True) 

1229 # Load registry data with dimensions to hang datasets off of. 

1230 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry")) 

1231 butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) 

1232 # Add some RUN-type collections. 

1233 run1 = "run1" 

1234 butler.registry.registerRun(run1) 

1235 run2 = "run2" 

1236 butler.registry.registerRun(run2) 

1237 # put some datasets. ref1 and ref2 have the same data ID, and are in 

1238 # different runs. ref3 has a different data ID. 

1239 metric = makeExampleMetrics() 

1240 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"]) 

1241 datasetType = self.addDatasetType("prune_collections_test_dataset", dimensions, storageClass, 

1242 butler.registry) 

1243 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1) 

1244 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2) 

1245 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1) 

1246 

1247 # Simple prune. 

1248 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1249 with self.assertRaises(LookupError): 

1250 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1) 

1251 

1252 # Put data back. 

1253 ref1 = butler.put(metric, ref1.unresolved(), run=run1) 

1254 ref2 = butler.put(metric, ref2.unresolved(), run=run2) 

1255 ref3 = butler.put(metric, ref3.unresolved(), run=run1) 

1256 

1257 # Check that in normal mode, deleting the record first means that 

1258 # trash will not touch the file. 

1259 uri1 = butler.datastore.getURI(ref1) 

1260 butler.datastore.bridge.moveToTrash([ref1]) # Update the dataset_location table 

1261 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id}) 

1262 butler.datastore.trash(ref1) 

1263 butler.datastore.emptyTrash() 

1264 self.assertTrue(uri1.exists()) 

1265 uri1.remove() # Clean it up. 

1266 

1267 # Simulate execution butler setup by deleting the datastore 

1268 # record but keeping the file around and trusting. 

1269 butler.datastore.trustGetRequest = True 

1270 uri2 = butler.datastore.getURI(ref2) 

1271 uri3 = butler.datastore.getURI(ref3) 

1272 self.assertTrue(uri2.exists()) 

1273 self.assertTrue(uri3.exists()) 

1274 

1275 # Remove the datastore record. 

1276 butler.datastore.bridge.moveToTrash([ref2]) # Update the dataset_location table 

1277 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id}) 

1278 self.assertTrue(uri2.exists()) 

1279 butler.datastore.trash([ref2, ref3]) 

1280 # Immediate removal of the ref2 file. 

1281 self.assertFalse(uri2.exists()) 

1282 # But ref3 has to wait for the empty. 

1283 self.assertTrue(uri3.exists()) 

1284 butler.datastore.emptyTrash() 

1285 self.assertFalse(uri3.exists()) 

1286 

1287 # Clear out the datasets from registry. 

1288 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1289 
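# To restate the contract exercised above (no new behavior): with

# trustGetRequest set, refs lacking a datastore record have their

# artifacts removed as soon as trash() is called, while recorded refs

# are only removed by the subsequent emptyTrash():

#

#     butler.datastore.trash(refs)    # recorded refs merely queued

#     butler.datastore.emptyTrash()   # queued artifacts deleted here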

1290 

1291class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1292 """InMemoryDatastore specialization of a butler""" 

1293 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1294 fullConfigKey = None 

1295 useTempRoot = False 

1296 validationCanFail = False 

1297 datastoreStr = ["datastore='InMemory"] 

1298 datastoreName = ["InMemoryDatastore@"] 

1299 registryStr = "/gen3.sqlite3" 

1300 

1301 def testIngest(self): 

1302 pass 

1303 

1304 

1305class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1306 """PosixDatastore specialization""" 

1307 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

1308 fullConfigKey = ".datastore.datastores.1.formatters" 

1309 validationCanFail = True 

1310 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"] 

1311 datastoreName = ["InMemoryDatastore@", f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1", 

1312 "SecondDatastore"] 

1313 registryStr = "/gen3.sqlite3" 

1314 

1315 

1316class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase): 

1317 """Test that a yaml file in one location can refer to a root in another.""" 

1318 

1319 datastoreStr = ["dir1"] 

1320 # Disable the makeRepo test since we are deliberately not using 

1321 # butler.yaml as the config name. 

1322 fullConfigKey = None 

1323 

1324 def setUp(self): 

1325 self.root = makeTestTempDir(TESTDIR) 

1326 

1327 # Make a new repository in one place 

1328 self.dir1 = os.path.join(self.root, "dir1") 

1329 Butler.makeRepo(self.dir1, config=Config(self.configFile)) 

1330 

1331 # Move the yaml file to a different place and add a "root" 

1332 self.dir2 = os.path.join(self.root, "dir2") 

1333 os.makedirs(self.dir2, exist_ok=True) 

1334 configFile1 = os.path.join(self.dir1, "butler.yaml") 

1335 config = Config(configFile1) 

1336 config["root"] = self.dir1 

1337 configFile2 = os.path.join(self.dir2, "butler2.yaml") 

1338 config.dumpToUri(configFile2) 

1339 os.remove(configFile1) 

1340 self.tmpConfigFile = configFile2 

1341 
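# The key step above is config["root"] = self.dir1: when a butler config

# carries an explicit "root", the repo location is resolved from that

# value rather than from the directory holding the YAML file, so the

# relocated config conceptually contains (path illustrative):

#

#     root: /path/to/dir1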

1342 def testFileLocations(self): 

1343 self.assertNotEqual(self.dir1, self.dir2) 

1344 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml"))) 

1345 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml"))) 

1346 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3"))) 

1347 

1348 

1349class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase): 

1350 """Test that a config file created by makeRepo outside of repo works.""" 

1351 

1352 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1353 

1354 def setUp(self): 

1355 self.root = makeTestTempDir(TESTDIR) 

1356 self.root2 = makeTestTempDir(TESTDIR) 

1357 

1358 self.tmpConfigFile = os.path.join(self.root2, "different.yaml") 

1359 Butler.makeRepo(self.root, config=Config(self.configFile), 

1360 outfile=self.tmpConfigFile) 

1361 

1362 def tearDown(self): 

1363 if os.path.exists(self.root2): 

1364 shutil.rmtree(self.root2, ignore_errors=True) 

1365 super().tearDown() 

1366 

1367 def testConfigExistence(self): 

1368 c = Config(self.tmpConfigFile) 

1369 uri_config = ButlerURI(c["root"]) 

1370 uri_expected = ButlerURI(self.root, forceDirectory=True) 

1371 self.assertEqual(uri_config.geturl(), uri_expected.geturl()) 

1372 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path") 

1373 

1374 def testPutGet(self): 

1375 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1376 self.runPutGetTest(storageClass, "test_metric") 

1377 

1378 

1379class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase): 

1380 """Test that a config file created by makeRepo outside of repo works.""" 

1381 

1382 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1383 

1384 def setUp(self): 

1385 self.root = makeTestTempDir(TESTDIR) 

1386 self.root2 = makeTestTempDir(TESTDIR) 

1387 

1388 self.tmpConfigFile = self.root2 

1389 Butler.makeRepo(self.root, config=Config(self.configFile), 

1390 outfile=self.tmpConfigFile) 

1391 

1392 def testConfigExistence(self): 

1393 # Append the yaml file name, else the Config constructor does not know 

1394 # the file type. 

1395 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml") 

1396 super().testConfigExistence() 

1397 

1398 

1399class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase): 

1400 """Test that a config file created by makeRepo outside of repo works.""" 

1401 

1402 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1403 

1404 def setUp(self): 

1405 self.root = makeTestTempDir(TESTDIR) 

1406 self.root2 = makeTestTempDir(TESTDIR) 

1407 

1408 self.tmpConfigFile = ButlerURI(os.path.join(self.root2, "something.yaml")).geturl() 

1409 Butler.makeRepo(self.root, config=Config(self.configFile), 

1410 outfile=self.tmpConfigFile) 

1411 

1412 

1413@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!") 

1414@mock_s3 

1415class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1416 """S3Datastore specialization of a butler; an S3 storage Datastore + 

1417 a local in-memory SqlRegistry. 

1418 """ 

1419 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml") 

1420 fullConfigKey = None 

1421 validationCanFail = True 

1422 

1423 bucketName = "anybucketname" 

1424 """Name of the Bucket that will be used in the tests. The name is read from 

1425 the config file used with the tests during set-up. 

1426 """ 

1427 

1428 root = "butlerRoot/" 

1429 """Root repository directory expected to be used in case useTempRoot=False. 

1430 Otherwise the root is set to a 20 characters long randomly generated string 

1431 during set-up. 

1432 """ 

1433 

1434 datastoreStr = [f"datastore={root}"] 

1435 """Contains all expected root locations in a format expected to be 

1436 returned by Butler stringification. 

1437 """ 

1438 

1439 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"] 

1440 """The expected format of the S3 Datastore string.""" 

1441 

1442 registryStr = "/gen3.sqlite3" 

1443 """Expected format of the Registry string.""" 

1444 

1445 def genRoot(self): 

1446 """Returns a random string of len 20 to serve as a root 

1447 name for the temporary bucket repo. 

1448 

1449 This is equivalent to tempfile.mkdtemp as this is what self.root 

1450 becomes when useTempRoot is True. 

1451 """ 

1452 rndstr = "".join( 

1453 random.choice(string.ascii_uppercase + string.digits) for _ in range(20) 

1454 ) 

1455 return rndstr + "/" 

1456 

1457 def setUp(self): 

1458 config = Config(self.configFile) 

1459 uri = ButlerURI(config[".datastore.datastore.root"]) 

1460 self.bucketName = uri.netloc 

1461 

1462 # set up some fake credentials if they do not exist 

1463 self.usingDummyCredentials = setAwsEnvCredentials() 

1464 

1465 if self.useTempRoot: 

1466 self.root = self.genRoot() 

1467 rooturi = f"s3://{self.bucketName}/{self.root}" 

1468 config.update({"datastore": {"datastore": {"root": rooturi}}}) 

1469 

1470 # need local folder to store registry database 

1471 self.reg_dir = makeTestTempDir(TESTDIR) 

1472 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

1473 

1474 # Moto needs to know that we expect the named bucket to exist 

1475 # (the name was previously the fixed class attribute bucketName) 
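# Under @mock_s3 the boto3 calls below are served entirely by moto;

# no real AWS endpoint is contacted.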

1476 s3 = boto3.resource("s3") 

1477 s3.create_bucket(Bucket=self.bucketName) 

1478 

1479 self.datastoreStr = f"datastore={self.root}" 

1480 self.datastoreName = [f"FileDatastore@{rooturi}"] 

1481 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False) 

1482 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml") 

1483 

1484 def tearDown(self): 

1485 s3 = boto3.resource("s3") 

1486 bucket = s3.Bucket(self.bucketName) 

1487 try: 

1488 bucket.objects.all().delete() 

1489 except botocore.exceptions.ClientError as e: 

1490 if e.response["Error"]["Code"] == "404": 

1491 # the key was not reachable - pass 

1492 pass 

1493 else: 

1494 raise 

1495 

1496 bucket = s3.Bucket(self.bucketName) 

1497 bucket.delete() 

1498 

1499 # unset any potentially set dummy credentials 

1500 if self.usingDummyCredentials: 

1501 unsetAwsEnvCredentials() 

1502 

1503 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

1504 shutil.rmtree(self.reg_dir, ignore_errors=True) 

1505 

1506 if self.useTempRoot and os.path.exists(self.root): 

1507 shutil.rmtree(self.root, ignore_errors=True) 

1508 

1509 

1510@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!") 

1511# Mock required environment variables during tests 

1512@unittest.mock.patch.dict(os.environ, {"LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1513 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join( 

1514 TESTDIR, "config/testConfigs/webdav/token"), 

1515 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs"}) 

1516class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1517 """WebdavDatastore specialization of a butler; a Webdav storage Datastore + 

1518 a local in-memory SqlRegistry. 

1519 """ 

1520 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml") 

1521 fullConfigKey = None 

1522 validationCanFail = True 

1523 

1524 serverName = "localhost" 

1525 """Name of the server that will be used in the tests. 

1526 """ 

1527 

1528 portNumber = 8080 

1529 """Port on which the webdav server listens. Automatically chosen 

1530 at setUpClass via the _getfreeport() method 

1531 """ 

1532 

1533 root = "butlerRoot/" 

1534 """Root repository directory expected to be used in case useTempRoot=False. 

1535 Otherwise the root is set to a 20 characters long randomly generated string 

1536 during set-up. 

1537 """ 

1538 

1539 datastoreStr = [f"datastore={root}"] 

1540 """Contains all expected root locations in a format expected to be 

1541 returned by Butler stringification. 

1542 """ 

1543 

1544 datastoreName = ["FileDatastore@https://{serverName}/{root}"] 

1545 """The expected format of the WebdavDatastore string.""" 

1546 

1547 registryStr = "/gen3.sqlite3" 

1548 """Expected format of the Registry string.""" 

1549 

1550 serverThread = None 

1551 """Thread in which the local webdav server will run""" 

1552 

1553 stopWebdavServer = False 

1554 """This flag will cause the webdav server to 

1555 gracefully shut down when True 

1556 """ 

1557 

1558 def genRoot(self): 

1559 """Returns a random string of len 20 to serve as a root 

1560 name for the temporary bucket repo. 

1561 

1562 This is equivalent to tempfile.mkdtemp as this is what self.root 

1563 becomes when useTempRoot is True. 

1564 """ 

1565 rndstr = "".join( 

1566 random.choice(string.ascii_uppercase + string.digits) for _ in range(20) 

1567 ) 

1568 return rndstr + "/" 

1569 

1570 @classmethod 

1571 def setUpClass(cls): 

1572 # Do the same as the inherited class 

1573 cls.storageClassFactory = StorageClassFactory() 

1574 cls.storageClassFactory.addFromConfig(cls.configFile) 

1575 

1576 cls.portNumber = cls._getfreeport() 

1577 # Run a local webdav server on which tests will be run 

1578 cls.serverThread = Thread(target=cls._serveWebdav, 

1579 args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), 

1580 daemon=True) 

1581 cls.serverThread.start() 

1582 # Wait for it to start 

1583 time.sleep(3) 

1584 

1585 @classmethod 

1586 def tearDownClass(cls): 

1587 # Ask for graceful shut down of the webdav server 

1588 cls.stopWebdavServer = True 

1589 # Wait for the thread to exit 

1590 cls.serverThread.join() 

1591 

1592 # Mock required environment variables during tests 

1593 @unittest.mock.patch.dict(os.environ, {"LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1594 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join( 

1595 TESTDIR, "config/testConfigs/webdav/token"), 

1596 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs"}) 

1597 def setUp(self): 

1598 config = Config(self.configFile) 

1599 

1600 if self.useTempRoot: 

1601 self.root = self.genRoot() 

1602 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}" 

1603 config.update({"datastore": {"datastore": {"root": self.rooturi}}}) 

1604 

1605 # need local folder to store registry database 

1606 self.reg_dir = makeTestTempDir(TESTDIR) 

1607 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

1608 

1609 self.datastoreStr = f"datastore={self.root}" 

1610 self.datastoreName = [f"FileDatastore@{self.rooturi}"] 

1611 

1612 if not isWebdavEndpoint(self.rooturi): 

1613 raise OSError("Webdav server not running properly: cannot run tests.") 

1614 

1615 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False) 

1616 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml") 

1617 

1618 # Mock required environment variables during tests 

1619 @unittest.mock.patch.dict(os.environ, {"LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1620 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join( 

1621 TESTDIR, "config/testConfigs/webdav/token"), 

1622 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs"}) 

1623 def tearDown(self): 

1624 # Clear temporary directory 

1625 ButlerURI(self.rooturi).remove() 

1626 ButlerURI(self.rooturi).session.close() 

1627 

1628 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

1629 shutil.rmtree(self.reg_dir, ignore_errors=True) 

1630 

1631 if self.useTempRoot and os.path.exists(self.root): 

1632 shutil.rmtree(self.root, ignore_errors=True) 

1633 

1634 def _serveWebdav(self, port: int, stopWebdavServer): 

1635 """Starts a local webdav-compatible HTTP server, 

1636 Listening on http://localhost:port 

1637 This server only runs when this test class is instantiated, 

1638 and then shuts down. Must be started is a separate thread. 

1639 

1640 Parameters 

1641 ---------- 

1642 port : `int` 

1643 The port number on which the server should listen 

1644 """ 

1645 root_path = gettempdir() 

1646 

1647 config = { 

1648 "host": "0.0.0.0", 

1649 "port": port, 

1650 "provider_mapping": {"/": root_path}, 

1651 "http_authenticator": { 

1652 "domain_controller": None 

1653 }, 

1654 "simple_dc": {"user_mapping": {"*": True}}, 

1655 "verbose": 0, 

1656 } 

1657 app = WsgiDAVApp(config) 

1658 

1659 server_args = { 

1660 "bind_addr": (config["host"], config["port"]), 

1661 "wsgi_app": app, 

1662 } 

1663 server = wsgi.Server(**server_args) 

1664 server.prepare() 

1665 

1666 try: 

1667 # Start the actual server in a separate thread 

1668 t = Thread(target=server.serve, daemon=True) 

1669 t.start() 

1670 # Watch stopWebdavServer and gracefully shut down 

1671 # the server when it returns True. 

1672 while True: 

1673 if stopWebdavServer(): 

1674 break 

1675 time.sleep(1) 

1676 except KeyboardInterrupt: 

1677 print("Caught Ctrl-C, shutting down...") 

1678 finally: 

1679 server.stop() 

1680 t.join() 

1681 

1682 def _getfreeport(): 

1683 """ 

1684 Determines a free port using sockets. 

1685 """ 

1686 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 

1687 free_socket.bind(('0.0.0.0', 0)) 

1688 free_socket.listen() 

1689 port = free_socket.getsockname()[1] 

1690 free_socket.close() 
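# Note: this probe is inherently racy -- another process could claim

# the port between close() here and the server's later bind; that

# window is acceptable for these tests.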

1691 return port 

1692 

1693 

1694class PosixDatastoreTransfers(unittest.TestCase): 

1695 """Test data transfers between butlers. 

1696 

1697 Tests cover different dataset-ID managers: UUID to UUID and integer to 

1698 integer are tested. UUID to integer is not supported, since we do not 

1699 currently want to allow that. Integer to UUID is supported, with the 

1700 caveat that a UUID4 will be generated, which is incorrect for raw 

1701 dataset types; the test ignores that. 

1702 """ 

1703 
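# The core transfer pattern under test, in sketch form (butler names

# are illustrative; both calls appear for real in assertButlerTransfers):

#

#     refs = list(source_butler.registry.queryDatasets(..., collections=...))

#     target_butler.transfer_from(source_butler, refs,

#                                 register_dataset_types=True)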

1704 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1705 

1706 @classmethod 

1707 def setUpClass(cls): 

1708 cls.storageClassFactory = StorageClassFactory() 

1709 cls.storageClassFactory.addFromConfig(cls.configFile) 

1710 

1711 def setUp(self): 

1712 self.root = makeTestTempDir(TESTDIR) 

1713 self.config = Config(self.configFile) 

1714 

1715 def tearDown(self): 

1716 removeTestTempDir(self.root) 

1717 

1718 def create_butler(self, manager, label): 

1719 config = Config(self.configFile) 

1720 config["registry", "managers", "datasets"] = manager 

1721 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), 

1722 writeable=True) 

1723 

1724 def create_butlers(self, manager1, manager2): 

1725 self.source_butler = self.create_butler(manager1, "1") 

1726 self.target_butler = self.create_butler(manager2, "2") 

1727 

1728 def testTransferUuidToUuid(self): 

1729 self.create_butlers("lsst.daf.butler.registry.datasets.byDimensions." 

1730 "ByDimensionsDatasetRecordStorageManagerUUID", 

1731 "lsst.daf.butler.registry.datasets.byDimensions." 

1732 "ByDimensionsDatasetRecordStorageManagerUUID", 

1733 ) 

1734 # Setting id_gen_map should have no effect here 

1735 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE}) 

1736 

1737 def testTransferIntToInt(self): 

1738 self.create_butlers("lsst.daf.butler.registry.datasets.byDimensions." 

1739 "ByDimensionsDatasetRecordStorageManager", 

1740 "lsst.daf.butler.registry.datasets.byDimensions." 

1741 "ByDimensionsDatasetRecordStorageManager", 

1742 ) 

1743 # int dataset ID only allows UNIQUE 

1744 self.assertButlerTransfers() 

1745 

1746 def testTransferIntToUuid(self): 

1747 self.create_butlers("lsst.daf.butler.registry.datasets.byDimensions." 

1748 "ByDimensionsDatasetRecordStorageManager", 

1749 "lsst.daf.butler.registry.datasets.byDimensions." 

1750 "ByDimensionsDatasetRecordStorageManagerUUID", 

1751 ) 

1752 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE}) 

1753 

1754 def testTransferMissing(self): 

1755 """Test transfers where datastore records are missing. 

1756 

1757 This is how execution butler works. 

1758 """ 

1759 self.create_butlers("lsst.daf.butler.registry.datasets.byDimensions." 

1760 "ByDimensionsDatasetRecordStorageManagerUUID", 

1761 "lsst.daf.butler.registry.datasets.byDimensions." 

1762 "ByDimensionsDatasetRecordStorageManagerUUID", 

1763 ) 

1764 

1765 # Configure the source butler to allow trust. 

1766 self.source_butler.datastore.trustGetRequest = True 

1767 

1768 self.assertButlerTransfers(purge=True) 

1769 

1770 def testTransferMissingDisassembly(self): 

1771 """Test transfers where datastore records are missing. 

1772 

1773 This is how execution butler works. 

1774 """ 

1775 self.create_butlers("lsst.daf.butler.registry.datasets.byDimensions." 

1776 "ByDimensionsDatasetRecordStorageManagerUUID", 

1777 "lsst.daf.butler.registry.datasets.byDimensions." 

1778 "ByDimensionsDatasetRecordStorageManagerUUID", 

1779 ) 

1780 

1781 # Configure the source butler to allow trust. 

1782 self.source_butler.datastore.trustGetRequest = True 

1783 

1784 # Test disassembly. 

1785 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite") 

1786 

1787 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"): 

1788 """Test that a run can be transferred to another butler.""" 

1789 

1790 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

1791 datasetTypeName = "random_data" 

1792 

1793 # The test will create three collections, and we will want to transfer 

1794 # two of those three. 

1795 runs = ["run1", "run2", "other"] 

1796 

1797 # Also want to use two different dataset types to ensure that 

1798 # grouping works. 

1799 datasetTypeNames = ["random_data", "random_data_2"] 

1800 

1801 # Create the run collections in the source butler. 

1802 for run in runs: 

1803 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

1804 

1805 # Create dimensions in both butlers (transfer will not create them). 

1806 n_exposures = 30 

1807 for butler in (self.source_butler, self.target_butler): 

1808 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

1809 butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp", 

1810 "name": "d-r", 

1811 "band": "R"}) 

1812 butler.registry.insertDimensionData("detector", {"instrument": "DummyCamComp", 

1813 "id": 1, "full_name": "det1"}) 

1814 

1815 for i in range(n_exposures): 

1816 butler.registry.insertDimensionData("exposure", {"instrument": "DummyCamComp", 

1817 "id": i, "obs_id": f"exp{i}", 

1818 "physical_filter": "d-r"}) 

1819 

1820 # Create dataset types in the source butler. 

1821 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"]) 

1822 for datasetTypeName in datasetTypeNames: 

1823 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

1824 self.source_butler.registry.registerDatasetType(datasetType) 

1825 

1826 # Write a dataset to an unrelated run -- this will ensure that 

1827 # we are rewriting integer dataset ids in the target if necessary. 

1828 # This is not relevant for UUIDs. 

1829 run = "distraction" 

1830 butler = Butler(butler=self.source_butler, run=run) 

1831 butler.put(makeExampleMetrics(), datasetTypeName, 

1832 exposure=1, detector=1, instrument="DummyCamComp", physical_filter="d-r") 

1833 

1834 # Write some example metrics to the source 

1835 butler = Butler(butler=self.source_butler) 

1836 

1837 # Set of DatasetRefs that should be in the list of refs to transfer 

1838 # but which will not be transferred. 

1839 deleted = set() 

1840 

1841 n_expected = 20 # Number of datasets expected to be transferred 

1842 source_refs = [] 

1843 for i in range(n_exposures): 

1844 # Put a third of the datasets into each collection; only retain 

1845 # two thirds. 

1846 index = i % 3 

1847 run = runs[index] 

1848 datasetTypeName = datasetTypeNames[i % 2] 

1849 

1850 metric_data = {"summary": {"counter": i}, 

1851 "output": {"text": "metric"}, 

1852 "data": [2*x for x in range(i)]} 

1853 metric = MetricsExample(**metric_data) 

1854 dataId = {"exposure": i, "detector": 1, "instrument": "DummyCamComp", "physical_filter": "d-r"} 

1855 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run) 

1856 

1857 # Remove the datastore record using low-level API 

1858 if purge: 

1859 # Remove records for a fraction. 

1860 if index == 1: 

1861 

1862 # For one of these delete the file as well. 

1863 # This allows the "missing" code to filter the 

1864 # file out. 

1865 if not deleted: 

1866 primary, uris = butler.datastore.getURIs(ref) 

1867 if primary: 

1868 primary.remove() 

1869 for uri in uris.values(): 

1870 uri.remove() 

1871 n_expected -= 1 

1872 deleted.add(ref) 

1873 

1874 # Remove the datastore record. 

1875 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

1876 

1877 if index < 2: 

1878 source_refs.append(ref) 

1879 if ref not in deleted: 

1880 new_metric = butler.get(ref.unresolved(), collections=run) 

1881 self.assertEqual(new_metric, metric) 

1882 

1883 # Create some bad dataset types to ensure we check for inconsistent 

1884 # definitions. 

1885 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList") 

1886 for datasetTypeName in datasetTypeNames: 

1887 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass) 

1888 self.target_butler.registry.registerDatasetType(datasetType) 

1889 with self.assertRaises(ConflictingDefinitionError): 

1890 self.target_butler.transfer_from(self.source_butler, source_refs, 

1891 id_gen_map=id_gen_map) 

1892 # And remove the bad definitions. 

1893 for datasetTypeName in datasetTypeNames: 

1894 self.target_butler.registry.removeDatasetType(datasetTypeName) 

1895 

1896 # Transfer without creating dataset types should fail. 

1897 with self.assertRaises(KeyError): 

1898 self.target_butler.transfer_from(self.source_butler, source_refs, 

1899 id_gen_map=id_gen_map) 

1900 

1901 # Now transfer them to the second butler 

1902 with self.assertLogs(level=logging.DEBUG) as cm: 

1903 transferred = self.target_butler.transfer_from(self.source_butler, source_refs, 

1904 id_gen_map=id_gen_map, 

1905 register_dataset_types=True) 

1906 self.assertEqual(len(transferred), n_expected) 

1907 log_output = ";".join(cm.output) 

1908 self.assertIn("found in datastore for chunk", log_output) 

1909 self.assertIn("Creating output run", log_output) 

1910 

1911 # Do the transfer twice to ensure that it will do nothing extra. 

1912 # Only do this if purge=True because it does not work for int 

1913 # dataset_id. 

1914 if purge: 

1915 # This should not need to register dataset types. 

1916 transferred = self.target_butler.transfer_from(self.source_butler, source_refs, 

1917 id_gen_map=id_gen_map) 

1918 self.assertEqual(len(transferred), n_expected) 

1919 

1920 # Also do an explicit low-level transfer to trigger some 

1921 # edge cases. 

1922 with self.assertLogs(level=logging.DEBUG) as cm: 

1923 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs) 

1924 log_output = ";".join(cm.output) 

1925 self.assertIn("no file artifacts exist", log_output) 

1926 

1927 with self.assertRaises(TypeError): 

1928 self.target_butler.datastore.transfer_from(self.source_butler, source_refs) 

1929 

1930 with self.assertRaises(ValueError): 

1931 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs, 

1932 transfer="split") 

1933 

1934 # Now try to get the same refs from the new butler. 

1935 for ref in source_refs: 

1936 if ref not in deleted: 

1937 unresolved_ref = ref.unresolved() 

1938 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run) 

1939 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run) 

1940 self.assertEqual(new_metric, old_metric) 

1941 

1942 # Now prune the run2 collection and instead create a CHAINED collection. 

1943 # This should block the transfer. 

1944 self.target_butler.pruneCollection("run2", purge=True, unstore=True) 

1945 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED) 

1946 with self.assertRaises(TypeError): 

1947 # Re-importing the run1 datasets can be problematic if they 

1948 # use integer IDs so filter those out. 

1949 to_transfer = [ref for ref in source_refs if ref.run == "run2"] 

1950 self.target_butler.transfer_from(self.source_butler, to_transfer, 

1951 id_gen_map=id_gen_map) 

1952 

1953 

1954if __name__ == "__main__": 

1955 unittest.main()