# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler.
"""

import logging
import os
import posixpath
import unittest
import tempfile
import shutil
import pickle
import string
import random
import time
import socket
import pathlib

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator for use when the moto mock_s3 decorator cannot
        be imported.
        """
        return cls
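
    # With this stand-in decorator the module remains importable when moto is
    # not installed, and ``boto3 = None`` acts as a sentinel so that the
    # S3-specific test cases can be skipped rather than fail at import time.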

try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None

import astropy.time
from threading import Thread
from tempfile import gettempdir
from lsst.utils import doImport
from lsst.daf.butler import Butler, Config, ButlerConfig
from lsst.daf.butler import StorageClassFactory
from lsst.daf.butler import DatasetType, DatasetRef, DatasetIdGenEnum
from lsst.daf.butler import FileTemplateValidationError, ValidationError
from lsst.daf.butler import FileDataset
from lsst.daf.butler import CollectionSearch, CollectionType
from lsst.daf.butler import ButlerURI
from lsst.daf.butler import script
from lsst.daf.butler.registry import MissingCollectionError, ConflictingDefinitionError
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.core._butlerUri.s3utils import (setAwsEnvCredentials,
                                                     unsetAwsEnvCredentials)
from lsst.daf.butler.core._butlerUri.http import isWebdavEndpoint

from lsst.daf.butler.tests import MultiDetectorFormatter, MetricsExample
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
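    # The three positional arguments populate the ``summary``, ``output`` and
    # ``data`` attributes that the component and parameter tests below read
    # back.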
    return MetricsExample({"AM1": 5.2, "AM2": 30.6},
                          {"a": [1, 2, 3],
                           "b": {"blue": 5, "red": "green"}},
                          [563, 234, 456.7, 752, 8, 9, 27]
                          )


class TransactionTestError(Exception):
    """Specific error for testing transactions, used to prevent the
    misdiagnosis that might otherwise occur when a standard exception is
    raised.
    """
    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not covered by the other test
    cases.
    """

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests:
    """Helper class providing a suite of put/get tests to be run against
    different butler configurations."""

    root = None

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it.
        """
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
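        # ``configFile`` is provided by each concrete subclass, so the
        # storage classes declared there are available to every test in the
        # suite.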
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
        removeTestTempDir(self.root)

    def runPutGetTest(self, storageClass, datasetTypeName):
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = "ingest"
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {run})

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "band": "R"})
        butler.registry.insertDimensionData("visit_system", {"instrument": "DummyCamComp",
                                                             "id": 1,
                                                             "name": "default"})
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 423,
                                                      "name": "fourtwentythree", "physical_filter": "d-r",
                                                      "visit_system": 1, "datetime_begin": visit_start,
                                                      "datetime_end": visit_end})

        # Add a second visit for some later tests
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 424,
                                                      "name": "fourtwentyfour", "physical_filter": "d-r",
                                                      "visit_system": 1})

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType.

        # Keep track of any collections we add and do not clean up.
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here, with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # ... and deferred direct with a ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(butler, ref,
                                             ("summary", "data", "output"), metric,
                                             collections=this_run)

                # Can the artifacts themselves be retrieved?
                if not butler.datastore.isEphemeral:
                    root_uri = ButlerURI(self.root)

                    for preserve_path in (True, False):
                        destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                        # Use copy so that we can test that overwrite
                        # protection works (using "auto" for File URIs would
                        # use hard links and a subsequent transfer would work
                        # because it knows they are the same file).
                        transferred = butler.retrieveArtifacts([ref], destination,
                                                               preserve_path=preserve_path, transfer="copy")
                        self.assertGreater(len(transferred), 0)
                        artifacts = list(ButlerURI.findFileResources([destination]))
                        self.assertEqual(set(transferred), set(artifacts))

                        for artifact in transferred:
                            path_in_destination = artifact.relative_to(destination)
                            self.assertIsNotNone(path_in_destination)

                            # When the path is not preserved there should not
                            # be any path separators.
                            num_seps = path_in_destination.count("/")
                            if preserve_path:
                                self.assertGreater(num_seps, 0)
                            else:
                                self.assertEqual(num_seps, 0)

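                        # A monolithic dataset reports a single primary URI,
                        # while a disassembled composite instead reports one
                        # secondary URI per component (see the virtual
                        # composite tests below), so count both forms here.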
                        primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                        n_uris = len(secondary_uris)
                        if primary_uri:
                            n_uris += 1
                        self.assertEqual(len(artifacts), n_uris,
                                         "Comparing expected artifacts vs actual:"
                                         f" {artifacts} vs {primary_uri} and {secondary_uris}")

                        if preserve_path:
                            # No need to run these twice
                            with self.assertRaises(ValueError):
                                butler.retrieveArtifacts([ref], destination, transfer="move")

                            with self.assertRaises(FileExistsError):
                                butler.retrieveArtifacts([ref], destination)

                            transferred_again = butler.retrieveArtifacts([ref], destination,
                                                                         preserve_path=preserve_path,
                                                                         overwrite=True)
                            self.assertEqual(set(transferred_again), set(transferred))

                # Now remove the dataset completely.
                butler.pruneDatasets([ref], purge=True, unstore=True, run=this_run)
                # Lookup with the original args should now fail.
                with self.assertRaises(LookupError):
                    butler.datasetExists(*args, collections=this_run)
                # getDirect() should also fail.
                with self.assertRaises(FileNotFoundError):
                    butler.getDirect(ref)
                # Registry shouldn't be able to find it by dataset_id anymore.
                self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal, since we know the run collections
            # are now empty.
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId,
                                   parameters={"slice": slice(stop)})
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(datasetTypeName, dimensions,
                                              self.storageClassFactory.getStorageClass("Config"))

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check that we have exactly the collections we expect.
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed.
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # A repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed.
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail, but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion.
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place, since some downstream tests require
        # something to be present.

        return butler

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType("example", dimensions,
                                          self.storageClassFactory.getStorageClass("StructuredData"),
                                          butler.registry)
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "band": "R"})
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 423,
                                                      "name": "fourtwentythree", "physical_filter": "d-r"})
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # A second registration will be allowed, but indicates a no-op.
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with TypeError.
        with self.assertRaises(TypeError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is a TypeError.
        with self.assertRaises(TypeError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(TypeError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Deleting the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler.
    """
    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor.
        """
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {"ingest"})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(
            butler2.collections,
            CollectionSearch.fromExpression(["other"])
        )
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ButlerURI)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # A predicted dataset that does not yet exist should report a URI
        # carrying a "predicted" fragment.
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ButlerURI)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ButlerURI)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ButlerURI)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ButlerURI)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ButlerURI)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testIngest(self):
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "band": "R"})
        for detector in (1, 2):
            butler.registry.insertDimensionData("detector", {"instrument": "DummyCamComp", "id": detector,
                                                             "full_name": f"detector{detector}"})

        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 423,
                                                      "name": "fourtwentythree", "physical_filter": "d-r"},
                                            {"instrument": "DummyCamComp", "id": 424,
                                             "name": "fourtwentyfour", "physical_filter": "d-r"})

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile,
                                        refs=[refIn],
                                        formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Separate files were ingested, so the URIs must differ.
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single-file ingest.
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile,
                                    refs=refs,
                                    formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Both datasets come from one file, so the URIs must match.
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore, since in-memory cannot ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

    def testPruneCollections(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets.  ref1 and ref2 have the same data ID, and are in
        # different runs.  ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType("prune_collections_test_dataset", dimensions, storageClass,
                                          butler.registry)
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge but not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2.  It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
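        # Sketch of the shadowing (not asserted in this test): a find-first
        # search of chain1 for ref2's data ID stops at run1, e.g.
        #     butler.registry.findDataset(datasetType, ref2.dataId,
        #                                 collections=chain1)
        # would return ref1 rather than ref2.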
        # Try to delete the RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False.  This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False.  The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True.  Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1.  This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2.  This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)),
                              [])

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)

    def testPickle(self):
        """Test pickle support.
        """
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            ("instrument", {"instrument": "DummyCam"}, {"instrument": "DummyHSC"},
             {"instrument": "DummyCamComp"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"})
        ]
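        # Each entry above is a dimension element name followed by one or
        # more record mappings, matching the insertDimensionData(*args)
        # calls below.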
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are not
        # created for its components, but querying can still return the
        # components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(ignore=["test_metric_comp", "metric3", "calexp", "DummySC",
                                             "datasetType.component", "random_data", "random_data_2"])

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation, but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation, but ignore the bad datasetType
        butler.validateConfiguration(ignore=["test_metric_comp", "metric3", "calexp", "DummySC",
                                             "datasetType.component", "random_data", "random_data_2"])

    def testTransaction(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (("instrument", {"instrument": "DummyCam"}),
                            ("physical_filter", {"instrument": "DummyCam", "name": "d-r",
                                                 "band": "R"}),
                            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo",
                                       "physical_filter": "d-r"}))
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref,
                                         ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(LookupError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if the Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root.
        if self.fullConfigKey is None:
            return

        # Create two separate directories.
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them.
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with a relocatable Butler repo.
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True,
                            config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""
        butler = Butler(self.tmpConfigFile, run="ingest")

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "band": "R"})
        butler.registry.insertDimensionData("detector", {"instrument": "DummyCamComp",
                                                         "id": 1, "full_name": "det1"})

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)
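        # The dataset type is defined over the "exposure" dimension, but the
        # puts below identify data by seq_num and day_obs, so the butler must
        # rewrite those keys into an exposure ID using the dimension records
        # inserted next.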

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData("exposure", {"instrument": "DummyCamComp",
                                                             "id": i, "obs_id": f"exp{i}",
                                                             "seq_num": i, "day_obs": dayobs,
                                                             "physical_filter": "d-r"})

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i,
                      "other": "metric",
                      "list": [2*x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "detector": 1, "instrument": "DummyCamComp",
                      "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId.
            self.assertEqual(ref.dataId["exposure"], i)

            # And check that we can get the dataset back with the same dataId.
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check if a file exists at a given path (relative to root).

        The testPutTemplates test verifies the actual physical existence of
        the files in the requested location.
        """
        uri = ButlerURI(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp",
                                                                "name": "d-r",
                                                                "band": "R"})
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 423,
                                                      "name": "v423", "physical_filter": "d-r"})
        butler.registry.insertDimensionData("visit", {"instrument": "DummyCamComp", "id": 425,
                                                      "name": "v425", "physical_filter": "d-r"})

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use the default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(self.checkFileExists(butler.datastore.root,
                                             "ingest/metric1/??#?/d-r/DummyCamComp_423.pickle"),
                        f"Checking existence of {uri}")

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(self.checkFileExists(butler.datastore.root,
                                             "ingest/metric2/d-r/DummyCamComp_v423.pickle"),
                        f"Checking existence of {uri}")

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """Export a repository to a temp directory, then import it back into
        a new temp directory repo.  This does not assume a posix datastore.
        """
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        print("Root:", exportButler.datastore.root)
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import the datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again.  This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements
                # even though there aren't any in these datasets or in the
                # database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler.
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command-line interface "import" subcommand.
                # Functions in the script folder are generally considered
                # protected and should not be used as public api.
                with open(exportFile, "r") as f:
                    script.butlerImport(importDir, export_file=f, directory=exportDir,
                                        transfer="auto", skip_dimensions=None, reuse_ids=False)
                importButler = Butler(importDir, run="ingest")
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(list(importButler.registry.queryDimensionRecords("skymap")),
                                 [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)])

    def testRemoveRuns(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put a dataset in each.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType("prune_collections_test_dataset", dimensions, storageClass,
                                          butler.registry)
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])
        # Remove from both runs, with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # There should be nothing in the registry for either one, and the
        # datastore should not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we only forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler"""
    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    # Key expected in a fully-expanded config but not in the limited one.
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    # Substrings expected in the stringified butler and datastore name.
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self):
        """Independent test of the constructor using a PathLike object.
        """
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # And again with a Path object that points at the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self):
        """Test local export using each of the local file transfer modes."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        datastoreRoot = exportButler.datastore.root

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume a local file system.
            self.assertTrue(self.checkFileExists(datastoreRoot, path),
                            f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml",
                                         transfer=transfer) as export:
                    export.saveDatasets(datasets)
                    for path in pathsInStore:
                        self.assertTrue(self.checkFileExists(exportDir, path),
                                        f"Check that mode {transfer} exported files")

    def testPruneDatasets(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets.  ref1 and ref2 have the same data ID, and are in
        # different runs.  ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType("prune_collections_test_dataset", dimensions, storageClass,
                                          butler.registry)
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Simple prune.
        butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
        with self.assertRaises(LookupError):
            butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)

        # Put the data back.
        ref1 = butler.put(metric, ref1.unresolved(), run=run1)
        ref2 = butler.put(metric, ref2.unresolved(), run=run2)
        ref3 = butler.put(metric, ref3.unresolved(), run=run1)

        # Check that in normal mode, deleting the record will lead to
        # the trash not touching the file.
        uri1 = butler.datastore.getURI(ref1)
        butler.datastore.bridge.moveToTrash([ref1])  # Update the dataset_location table
        butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
        butler.datastore.trash(ref1)
        butler.datastore.emptyTrash()
        self.assertTrue(uri1.exists())
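        # With its datastore record already deleted, the trash/empty cycle no
        # longer knows about the artifact, so the file survives on disk and
        # has to be cleaned up by hand.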
        uri1.remove()  # Clean it up.

        # Simulate execution butler setup by deleting the datastore
        # record but keeping the file around and trusting.
        butler.datastore.trustGetRequest = True
        uri2 = butler.datastore.getURI(ref2)
        uri3 = butler.datastore.getURI(ref3)
        self.assertTrue(uri2.exists())
        self.assertTrue(uri3.exists())

        # Remove the datastore record.
        butler.datastore.bridge.moveToTrash([ref2])  # Update the dataset_location table
        butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id})
        self.assertTrue(uri2.exists())
        butler.datastore.trash([ref2, ref3])
        # Immediate removal for the ref2 file.
        self.assertFalse(uri2.exists())
        # But ref3 has to wait for the empty.
        self.assertTrue(uri3.exists())
        butler.datastore.emptyTrash()
        self.assertFalse(uri3.exists())

        # Clear out the datasets from registry.
        butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
1258 
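    # Condensed sketch (illustrative, not part of the original suite) of the
    # trust-mode deletion path exercised above: once the datastore record is
    # gone, ``trash`` removes the file immediately rather than waiting for
    # ``emptyTrash``.
    def _trashSketch(self, butler, ref):
        butler.datastore.trustGetRequest = True  # trust mode, as above
        uri = butler.datastore.getURI(ref)
        butler.datastore.bridge.moveToTrash([ref])  # update dataset_location
        butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id})
        butler.datastore.trash(ref)  # no record left, so the file goes now
        return uri.exists()  # expected False, per the assertions above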

1259 

1260class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1261 """InMemoryDatastore specialization of a butler""" 

1262 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1263 fullConfigKey = None 

1264 useTempRoot = False 

1265 validationCanFail = False 

1266 datastoreStr = ["datastore='InMemory"] 

1267 datastoreName = ["InMemoryDatastore@"] 

1268 registryStr = "/gen3.sqlite3" 

1269 

1270 def testIngest(self): 
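        # Skipped: an in-memory datastore has no file artifacts to ingest,
        # so the inherited ingest test does not apply (assumed rationale).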

1271 pass 

1272 

1273 

1274class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1275 """PosixDatastore specialization""" 

1276 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

1277 fullConfigKey = ".datastore.datastores.1.formatters" 

1278 validationCanFail = True 

1279 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"] 

1280 datastoreName = ["InMemoryDatastore@", f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1", 

1281 "SecondDatastore"] 

1282 registryStr = "/gen3.sqlite3" 

1283 

1284 

1285class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase): 

1286 """Test that a yaml file in one location can refer to a root in another.""" 

1287 

1288 datastoreStr = ["dir1"] 

1289 # Disable the makeRepo test since we are deliberately not using 

1290 # butler.yaml as the config name. 

1291 fullConfigKey = None 

1292 

1293 def setUp(self): 

1294 self.root = makeTestTempDir(TESTDIR) 

1295 

1296 # Make a new repository in one place 

1297 self.dir1 = os.path.join(self.root, "dir1") 

1298 Butler.makeRepo(self.dir1, config=Config(self.configFile)) 

1299 

1300 # Move the yaml file to a different place and add a "root" 

1301 self.dir2 = os.path.join(self.root, "dir2") 

1302 os.makedirs(self.dir2, exist_ok=True) 

1303 configFile1 = os.path.join(self.dir1, "butler.yaml") 

1304 config = Config(configFile1) 

1305 config["root"] = self.dir1 

1306 configFile2 = os.path.join(self.dir2, "butler2.yaml") 

1307 config.dumpToUri(configFile2) 

1308 os.remove(configFile1) 

1309 self.tmpConfigFile = configFile2 

1310 

1311 def testFileLocations(self): 

1312 self.assertNotEqual(self.dir1, self.dir2) 

1313 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml"))) 

1314 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml"))) 

1315 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3"))) 

1316 

1317 

1318class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase): 

1319 """Test that a config file created by makeRepo outside of repo works.""" 

1320 

1321 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1322 

1323 def setUp(self): 

1324 self.root = makeTestTempDir(TESTDIR) 

1325 self.root2 = makeTestTempDir(TESTDIR) 

1326 

1327 self.tmpConfigFile = os.path.join(self.root2, "different.yaml") 

1328 Butler.makeRepo(self.root, config=Config(self.configFile), 

1329 outfile=self.tmpConfigFile) 

1330 

1331 def tearDown(self): 

1332 if os.path.exists(self.root2): 

1333 shutil.rmtree(self.root2, ignore_errors=True) 

1334 super().tearDown() 

1335 

1336 def testConfigExistence(self): 

1337 c = Config(self.tmpConfigFile) 

1338 uri_config = ButlerURI(c["root"]) 

1339 uri_expected = ButlerURI(self.root, forceDirectory=True) 

1340 self.assertEqual(uri_config.geturl(), uri_expected.geturl()) 

1341 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path") 

1342 

1343 def testPutGet(self): 

1344 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1345 self.runPutGetTest(storageClass, "test_metric") 

1346 

1347 

1348class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase): 

1349 """Test that a config file created by makeRepo outside of repo works.""" 

1350 

1351 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1352 

1353 def setUp(self): 

1354 self.root = makeTestTempDir(TESTDIR) 

1355 self.root2 = makeTestTempDir(TESTDIR) 

1356 

1357 self.tmpConfigFile = self.root2 

1358 Butler.makeRepo(self.root, config=Config(self.configFile), 

1359 outfile=self.tmpConfigFile) 

1360 

1361 def testConfigExistence(self): 

1362 # Append the yaml file name, else the Config constructor does not

1363 # know the file type.

1364 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml") 

1365 super().testConfigExistence() 

1366 

1367 

1368class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase): 

1369 """Test that a config file created by makeRepo outside of repo works.""" 

1370 

1371 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1372 

1373 def setUp(self): 

1374 self.root = makeTestTempDir(TESTDIR) 

1375 self.root2 = makeTestTempDir(TESTDIR) 

1376 

1377 self.tmpConfigFile = ButlerURI(os.path.join(self.root2, "something.yaml")).geturl() 

1378 Butler.makeRepo(self.root, config=Config(self.configFile), 

1379 outfile=self.tmpConfigFile) 

1380 

1381 

1382@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!") 

1383@mock_s3 

1384class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1385 """S3Datastore specialization of a butler; an S3 storage Datastore + 

1386 a local in-memory SqlRegistry. 

1387 """ 

1388 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml") 

1389 fullConfigKey = None 

1390 validationCanFail = True 

1391 

1392 bucketName = "anybucketname" 

1393 """Name of the Bucket that will be used in the tests. The name is read from 

1394 the config file used with the tests during set-up. 

1395 """ 

1396 

1397 root = "butlerRoot/" 

1398 """Root repository directory expected to be used in case useTempRoot=False. 

1399 Otherwise the root is set to a 20 characters long randomly generated string 

1400 during set-up. 

1401 """ 

1402 

1403 datastoreStr = [f"datastore={root}"] 

1404 """Contains all expected root locations in a format expected to be 

1405 returned by Butler stringification. 

1406 """ 

1407 

1408 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"] 

1409 """The expected format of the S3 Datastore string.""" 

1410 

1411 registryStr = "/gen3.sqlite3" 

1412 """Expected format of the Registry string.""" 

1413 

1414 def genRoot(self): 

1415 """Returns a random string of len 20 to serve as a root 

1416 name for the temporary bucket repo. 

1417 

1418 This is equivalent to tempfile.mkdtemp as this is what self.root 

1419 becomes when useTempRoot is True. 

1420 """ 

1421 rndstr = "".join( 

1422 random.choice(string.ascii_uppercase + string.digits) for _ in range(20) 

1423 ) 

1424 return rndstr + "/" 

1425 

1426 def setUp(self): 

1427 config = Config(self.configFile) 

1428 uri = ButlerURI(config[".datastore.datastore.root"]) 

1429 self.bucketName = uri.netloc 

1430 

1431 # set up some fake credentials if they do not exist 

1432 self.usingDummyCredentials = setAwsEnvCredentials() 

1433 

1434 if self.useTempRoot: 

1435 self.root = self.genRoot() 

1436 rooturi = f"s3://{self.bucketName}/{self.root}" 

1437 config.update({"datastore": {"datastore": {"root": rooturi}}}) 

1438 

1439 # need local folder to store registry database 

1440 self.reg_dir = makeTestTempDir(TESTDIR) 

1441 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

1442 

1443 # Moto needs to know that we expect this bucket to exist

1444 # (this used to be the class attribute bucketName) 

1445 s3 = boto3.resource("s3") 

1446 s3.create_bucket(Bucket=self.bucketName) 

1447 

1448 self.datastoreStr = f"datastore={self.root}" 

1449 self.datastoreName = [f"FileDatastore@{rooturi}"] 

1450 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False) 

1451 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml") 

1452 

1453 def tearDown(self): 

1454 s3 = boto3.resource("s3") 

1455 bucket = s3.Bucket(self.bucketName) 

1456 try: 

1457 bucket.objects.all().delete() 

1458 except botocore.exceptions.ClientError as e: 

1459 if e.response["Error"]["Code"] == "404": 

1460 # the key was not reachable - pass 

1461 pass 

1462 else: 

1463 raise 

1464 

1465 bucket = s3.Bucket(self.bucketName) 

1466 bucket.delete() 

1467 

1468 # unset any potentially set dummy credentials 

1469 if self.usingDummyCredentials: 

1470 unsetAwsEnvCredentials() 

1471 

1472 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

1473 shutil.rmtree(self.reg_dir, ignore_errors=True) 

1474 

1475 if self.useTempRoot and os.path.exists(self.root): 

1476 shutil.rmtree(self.root, ignore_errors=True) 

1477 

1478 

1479@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!") 

1480# Mock required environment variables during tests 

1481@unittest.mock.patch.dict(os.environ, {"LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1482 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join( 

1483 TESTDIR, "config/testConfigs/webdav/token"), 

1484 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs"}) 

1485class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1486 """WebdavDatastore specialization of a butler; a Webdav storage Datastore + 

1487 a local in-memory SqlRegistry. 

1488 """ 

1489 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml") 

1490 fullConfigKey = None 

1491 validationCanFail = True 

1492 

1493 serverName = "localhost" 

1494 """Name of the server that will be used in the tests. 

1495 """ 

1496 

1497 portNumber = 8080 

1498 """Port on which the webdav server listens. Automatically chosen 

1499 at setUpClass via the _getfreeport() method.

1500 """ 

1501 

1502 root = "butlerRoot/" 

1503 """Root repository directory expected to be used in case useTempRoot=False. 

1504 Otherwise the root is set to a 20 characters long randomly generated string 

1505 during set-up. 

1506 """ 

1507 

1508 datastoreStr = [f"datastore={root}"] 

1509 """Contains all expected root locations in a format expected to be 

1510 returned by Butler stringification. 

1511 """ 

1512 

1513 datastoreName = ["FileDatastore@https://{serverName}/{root}"] 

1514 """The expected format of the WebdavDatastore string.""" 

1515 

1516 registryStr = "/gen3.sqlite3" 

1517 """Expected format of the Registry string.""" 

1518 

1519 serverThread = None 

1520 """Thread in which the local webdav server will run""" 

1521 

1522 stopWebdavServer = False 

1523 """This flag will cause the webdav server to 

1524 gracefully shut down when True 

1525 """ 

1526 
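    # Illustrative sketch (assumed names) of the shutdown handshake used by
    # this class: the worker polls a zero-argument callable and exits once it
    # returns True, mirroring ``lambda: cls.stopWebdavServer`` below.
    def _shutdownSketch(self):
        state = {"stop": False}

        def worker(should_stop):
            while not should_stop():
                time.sleep(0.1)

        t = Thread(target=worker, args=(lambda: state["stop"],), daemon=True)
        t.start()
        state["stop"] = True
        t.join()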

1527 def genRoot(self): 

1528 """Returns a random string of len 20 to serve as a root 

1529 name for the temporary bucket repo. 

1530 

1531 This is equivalent to tempfile.mkdtemp as this is what self.root 

1532 becomes when useTempRoot is True. 

1533 """ 

1534 rndstr = "".join( 

1535 random.choice(string.ascii_uppercase + string.digits) for _ in range(20) 

1536 ) 

1537 return rndstr + "/" 

1538 

1539 @classmethod 

1540 def setUpClass(cls): 

1541 # Do the same as inherited class 

1542 cls.storageClassFactory = StorageClassFactory() 

1543 cls.storageClassFactory.addFromConfig(cls.configFile) 

1544 

1545 cls.portNumber = cls._getfreeport() 

1546 # Run a local webdav server on which tests will be run 

1547 cls.serverThread = Thread(target=cls._serveWebdav, 

1548 args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), 

1549 daemon=True) 

1550 cls.serverThread.start() 

1551 # Wait for it to start 

1552 time.sleep(3) 

1553 

1554 @classmethod 

1555 def tearDownClass(cls): 

1556 # Ask for graceful shut down of the webdav server 

1557 cls.stopWebdavServer = True 

1558 # Wait for the thread to exit 

1559 cls.serverThread.join() 

1560 

1561 # Mock required environment variables during tests 

1562 @unittest.mock.patch.dict(os.environ, {"LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1563 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join( 

1564 TESTDIR, "config/testConfigs/webdav/token"), 

1565 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs"}) 

1566 def setUp(self): 

1567 config = Config(self.configFile) 

1568 

1569 if self.useTempRoot: 

1570 self.root = self.genRoot() 

1571 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}" 

1572 config.update({"datastore": {"datastore": {"root": self.rooturi}}}) 

1573 

1574 # need local folder to store registry database 

1575 self.reg_dir = makeTestTempDir(TESTDIR) 

1576 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

1577 

1578 self.datastoreStr = f"datastore={self.root}" 

1579 self.datastoreName = [f"FileDatastore@{self.rooturi}"] 

1580 

1581 if not isWebdavEndpoint(self.rooturi): 

1582 raise OSError("Webdav server not running properly: cannot run tests.") 

1583 

1584 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False) 

1585 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml") 

1586 

1587 # Mock required environment variables during tests 

1588 @unittest.mock.patch.dict(os.environ, {"LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1589 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join( 

1590 TESTDIR, "config/testConfigs/webdav/token"), 

1591 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs"}) 

1592 def tearDown(self): 

1593 # Clear temporary directory 

1594 ButlerURI(self.rooturi).remove() 

1595 ButlerURI(self.rooturi).session.close() 

1596 

1597 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

1598 shutil.rmtree(self.reg_dir, ignore_errors=True) 

1599 

1600 if self.useTempRoot and os.path.exists(self.root): 

1601 shutil.rmtree(self.root, ignore_errors=True) 

1602 

1603 def _serveWebdav(self, port: int, stopWebdavServer): 

1604 """Starts a local webdav-compatible HTTP server, 

1605 Listening on http://localhost:port 

1606 This server only runs when this test class is instantiated, 

1607 and then shuts down. Must be started is a separate thread. 

1608 

1609 Parameters 

1610 ---------- 

1611 port : `int` 

1612 The port number on which the server should listen.

1613 """ 

1614 root_path = gettempdir() 

1615 

1616 config = { 

1617 "host": "0.0.0.0", 

1618 "port": port, 

1619 "provider_mapping": {"/": root_path}, 

1620 "http_authenticator": { 

1621 "domain_controller": None 

1622 }, 

1623 "simple_dc": {"user_mapping": {"*": True}}, 

1624 "verbose": 0, 

1625 } 

1626 app = WsgiDAVApp(config) 

1627 

1628 server_args = { 

1629 "bind_addr": (config["host"], config["port"]), 

1630 "wsgi_app": app, 

1631 } 

1632 server = wsgi.Server(**server_args) 

1633 server.prepare() 

1634 

1635 try: 

1636 # Start the actual server in a separate thread 

1637 t = Thread(target=server.serve, daemon=True) 

1638 t.start() 

1639 # watch stopWebdavServer, and gracefully 

1640 # shut down the server when True 

1641 while True: 

1642 if stopWebdavServer(): 

1643 break 

1644 time.sleep(1) 

1645 except KeyboardInterrupt: 

1646 print("Caught Ctrl-C, shutting down...") 

1647 finally: 

1648 server.stop() 

1649 t.join() 

1650 

1651 def _getfreeport(): 
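        # (Note: defined without ``self``/``cls``; it is only ever looked up
        # on the class, as ``cls._getfreeport()``, where attribute access
        # returns the plain function.)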

1652 """ 

1653 Determines a free port using sockets. 

1654 """ 

1655 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 

1656 free_socket.bind(('0.0.0.0', 0)) 

1657 free_socket.listen() 

1658 port = free_socket.getsockname()[1] 

1659 free_socket.close() 

1660 return port 

1661 

1662 

1663class PosixDatastoreTransfers(unittest.TestCase): 

1664 """Test data transfers between butlers. 

1665 

1666 Tests cover different dataset-ID managers: UUID to UUID and integer to

1667 integer are exercised. UUID to integer is not supported since we do not

1668 currently want to allow it. Integer to UUID is supported, with the caveat

1669 that a UUID4 will be generated, which would be incorrect for raw

1670 dataset types; the test ignores that.

1671 """ 

1672 
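    # Hedged sketch of the API under test: given two writeable butlers and a
    # list of resolved refs, ``transfer_from`` copies registry entries and
    # file artifacts into the target. ``id_gen_map`` matters only when the
    # target must mint new UUIDs for incoming datasets; the dataset type name
    # keyed here is a placeholder.
    def _transferSketch(self, source, target, refs):
        return target.transfer_from(source, refs,
                                    id_gen_map={"raw": DatasetIdGenEnum.DATAID_TYPE},
                                    register_dataset_types=True)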

1673 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1674 

1675 @classmethod 

1676 def setUpClass(cls): 

1677 cls.storageClassFactory = StorageClassFactory() 

1678 cls.storageClassFactory.addFromConfig(cls.configFile) 

1679 

1680 def setUp(self): 

1681 self.root = makeTestTempDir(TESTDIR) 

1682 self.config = Config(self.configFile) 

1683 

1684 def tearDown(self): 

1685 removeTestTempDir(self.root) 

1686 

1687 def create_butler(self, manager, label): 

1688 config = Config(self.configFile) 

1689 config["registry", "managers", "datasets"] = manager 

1690 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), 

1691 writeable=True) 

1692 

1693 def create_butlers(self, manager1, manager2): 

1694 self.source_butler = self.create_butler(manager1, "1") 

1695 self.target_butler = self.create_butler(manager2, "2") 

1696 

1697 def testTransferUuidToUuid(self): 

1698 self.create_butlers("lsst.daf.butler.registry.datasets.byDimensions." 

1699 "ByDimensionsDatasetRecordStorageManagerUUID", 

1700 "lsst.daf.butler.registry.datasets.byDimensions." 

1701 "ByDimensionsDatasetRecordStorageManagerUUID", 

1702 ) 

1703 # Setting id_gen_map should have no effect here 

1704 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE}) 

1705 

1706 def testTransferIntToInt(self): 

1707 self.create_butlers("lsst.daf.butler.registry.datasets.byDimensions." 

1708 "ByDimensionsDatasetRecordStorageManager", 

1709 "lsst.daf.butler.registry.datasets.byDimensions." 

1710 "ByDimensionsDatasetRecordStorageManager", 

1711 ) 

1712 # int dataset ID only allows UNIQUE 

1713 self.assertButlerTransfers() 

1714 

1715 def testTransferIntToUuid(self): 

1716 self.create_butlers("lsst.daf.butler.registry.datasets.byDimensions." 

1717 "ByDimensionsDatasetRecordStorageManager", 

1718 "lsst.daf.butler.registry.datasets.byDimensions." 

1719 "ByDimensionsDatasetRecordStorageManagerUUID", 

1720 ) 

1721 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE}) 

1722 

1723 def testTransferMissing(self): 

1724 """Test transfers where datastore records are missing. 

1725 

1726 This is how execution butler works. 

1727 """ 

1728 self.create_butlers("lsst.daf.butler.registry.datasets.byDimensions." 

1729 "ByDimensionsDatasetRecordStorageManagerUUID", 

1730 "lsst.daf.butler.registry.datasets.byDimensions." 

1731 "ByDimensionsDatasetRecordStorageManagerUUID", 

1732 ) 

1733 

1734 # Configure the source butler to allow trust. 

1735 self.source_butler.datastore.trustGetRequest = True 

1736 

1737 self.assertButlerTransfers(purge=True) 

1738 

1739 def testTransferMissingDisassembly(self): 

1740 """Test transfers where datastore records are missing. 

1741 

1742 This is how execution butler works. 

1743 """ 

1744 self.create_butlers("lsst.daf.butler.registry.datasets.byDimensions." 

1745 "ByDimensionsDatasetRecordStorageManagerUUID", 

1746 "lsst.daf.butler.registry.datasets.byDimensions." 

1747 "ByDimensionsDatasetRecordStorageManagerUUID", 

1748 ) 

1749 

1750 # Configure the source butler to allow trust. 

1751 self.source_butler.datastore.trustGetRequest = True 

1752 

1753 # Test disassembly. 

1754 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite") 

1755 

1756 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"): 

1757 """Test that a run can be transferred to another butler.""" 

1758 

1759 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

1760 datasetTypeName = "random_data" 

1761 

1762 # Test will create 3 collections and we will want to transfer 

1763 # two of those three. 

1764 runs = ["run1", "run2", "other"] 

1765 

1766 # Also want to use two different dataset types to ensure that 

1767 # grouping works. 

1768 datasetTypeNames = ["random_data", "random_data_2"] 

1769 

1770 # Create the run collections in the source butler. 

1771 for run in runs: 

1772 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

1773 

1774 # Create dimensions in both butlers (transfer will not create them). 

1775 n_exposures = 30 

1776 for butler in (self.source_butler, self.target_butler): 

1777 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

1778 butler.registry.insertDimensionData("physical_filter", {"instrument": "DummyCamComp", 

1779 "name": "d-r", 

1780 "band": "R"}) 

1781 butler.registry.insertDimensionData("detector", {"instrument": "DummyCamComp", 

1782 "id": 1, "full_name": "det1"}) 

1783 

1784 for i in range(n_exposures): 

1785 butler.registry.insertDimensionData("exposure", {"instrument": "DummyCamComp", 

1786 "id": i, "obs_id": f"exp{i}", 

1787 "physical_filter": "d-r"}) 

1788 

1789 # Create dataset types in the source butler. 

1790 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"]) 

1791 for datasetTypeName in datasetTypeNames: 

1792 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

1793 self.source_butler.registry.registerDatasetType(datasetType) 

1794 

1795 # Write a dataset to an unrelated run -- this will ensure that 

1796 # we are rewriting integer dataset ids in the target if necessary. 

1797 # Will not be relevant for UUID. 

1798 run = "distraction" 

1799 butler = Butler(butler=self.source_butler, run=run) 

1800 butler.put(makeExampleMetrics(), datasetTypeName, 

1801 exposure=1, detector=1, instrument="DummyCamComp", physical_filter="d-r") 

1802 

1803 # Write some example metrics to the source 

1804 butler = Butler(butler=self.source_butler) 

1805 

1806 # Set of DatasetRefs that should be in the list of refs to transfer 

1807 # but which will not be transferred. 

1808 deleted = set() 

1809 

1810 n_expected = 20 # Number of datasets expected to be transferred 

1811 source_refs = [] 

1812 for i in range(n_exposures): 

1813 # Put a third of datasets into each collection, only retain 

1814 # two thirds. 

1815 index = i % 3 

1816 run = runs[index] 

1817 datasetTypeName = datasetTypeNames[i % 2] 

1818 

1819 metric_data = {"summary": {"counter": i}, 

1820 "output": {"text": "metric"}, 

1821 "data": [2*x for x in range(i)]} 

1822 metric = MetricsExample(**metric_data) 

1823 dataId = {"exposure": i, "detector": 1, "instrument": "DummyCamComp", "physical_filter": "d-r"} 

1824 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run) 

1825 

1826 # Remove the datastore record using low-level API 

1827 if purge: 

1828 # Remove records for a fraction. 

1829 if index == 1: 

1830 

1831 # For one of these delete the file as well. 

1832 # This allows the "missing" code to filter the 

1833 # file out. 

1834 if not deleted: 

1835 primary, uris = butler.datastore.getURIs(ref) 

1836 if primary: 

1837 primary.remove() 

1838 for uri in uris.values(): 

1839 uri.remove() 

1840 n_expected -= 1 

1841 deleted.add(ref) 

1842 

1843 # Remove the datastore record. 

1844 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

1845 

1846 if index < 2: 

1847 source_refs.append(ref) 

1848 if ref not in deleted: 

1849 new_metric = butler.get(ref.unresolved(), collections=run) 

1850 self.assertEqual(new_metric, metric) 

1851 

1852 # Create some bad dataset types to ensure we check for inconsistent 

1853 # definitions. 

1854 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList") 

1855 for datasetTypeName in datasetTypeNames: 

1856 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass) 

1857 self.target_butler.registry.registerDatasetType(datasetType) 

1858 with self.assertRaises(ConflictingDefinitionError): 

1859 self.target_butler.transfer_from(self.source_butler, source_refs, 

1860 id_gen_map=id_gen_map) 

1861 # And remove the bad definitions. 

1862 for datasetTypeName in datasetTypeNames: 

1863 self.target_butler.registry.removeDatasetType(datasetTypeName) 

1864 

1865 # Transfer without creating dataset types should fail. 

1866 with self.assertRaises(KeyError): 

1867 self.target_butler.transfer_from(self.source_butler, source_refs, 

1868 id_gen_map=id_gen_map) 

1869 

1870 # Now transfer them to the second butler 

1871 with self.assertLogs(level=logging.DEBUG) as cm: 

1872 transferred = self.target_butler.transfer_from(self.source_butler, source_refs, 

1873 id_gen_map=id_gen_map, 

1874 register_dataset_types=True) 

1875 self.assertEqual(len(transferred), n_expected) 

1876 log_output = ";".join(cm.output) 

1877 self.assertIn("found in datastore for chunk", log_output) 

1878 self.assertIn("Creating output run", log_output) 

1879 

1880 # Do the transfer twice to ensure that it will do nothing extra. 

1881 # Only do this if purge=True because it does not work for int 

1882 # dataset_id. 

1883 if purge: 

1884 # This should not need to register dataset types. 

1885 transferred = self.target_butler.transfer_from(self.source_butler, source_refs, 

1886 id_gen_map=id_gen_map) 

1887 self.assertEqual(len(transferred), n_expected) 

1888 

1889 # Also do an explicit low-level transfer to trigger some 

1890 # edge cases. 

1891 with self.assertLogs(level=logging.DEBUG) as cm: 

1892 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs) 

1893 log_output = ";".join(cm.output) 

1894 self.assertIn("no file artifacts exist", log_output) 

1895 

1896 with self.assertRaises(TypeError): 

1897 self.target_butler.datastore.transfer_from(self.source_butler, source_refs) 

1898 

1899 with self.assertRaises(ValueError): 

1900 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs, 

1901 transfer="split") 

1902 

1903 # Now try to get the same refs from the new butler. 

1904 for ref in source_refs: 

1905 if ref not in deleted: 

1906 unresolved_ref = ref.unresolved() 

1907 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run) 

1908 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run) 

1909 self.assertEqual(new_metric, old_metric) 

1910 

1911 # Now prune run2 collection and create instead a CHAINED collection. 

1912 # This should block the transfer. 

1913 self.target_butler.pruneCollection("run2", purge=True, unstore=True) 

1914 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED) 

1915 with self.assertRaises(TypeError): 

1916 # Re-importing the run1 datasets can be problematic if they 

1917 # use integer IDs, so filter those out.

1918 to_transfer = [ref for ref in source_refs if ref.run == "run2"] 

1919 self.target_butler.transfer_from(self.source_butler, to_transfer, 

1920 id_gen_map=id_gen_map) 

1921 

1922 

1923if __name__ == "__main__": 

1924 unittest.main()