
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler."""

import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import socket
import string
import tempfile
import time
import unittest
import unittest.mock  # not pulled in by "import unittest"; needed for patch.dict below

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto mock_s3 can not be imported."""
        return cls
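# A minimal sketch (not part of the original tests) of how this decorator is
# meant to be used: with moto installed, ``mock_s3`` patches boto3's S3 API
# for everything inside the decorated test class, while the no-op fallback
# above merely keeps this module importable without moto. The class name
# below is hypothetical:
#
#     @mock_s3
#     class S3ButlerTestCase(unittest.TestCase):
#         def test_bucket(self):
#             boto3.client("s3").create_bucket(Bucket="demo-bucket")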

try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None

from tempfile import gettempdir
from threading import Thread

import astropy.time
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    CollectionSearch,
    CollectionType,
    Config,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError, MissingCollectionError
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.http import isWebdavEndpoint
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImport
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )


class TransactionTestError(Exception):
    """Specific error for testing transactions, used to prevent misdiagnosis
    that might otherwise occur when a standard exception is raised.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not tested in any other test
    cases."""

    def testSearchPath(self):
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests:
    """Helper methods for running a suite of put/get tests from different
    butler configurations."""

    root = None

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
        removeTestTempDir(self.root)

    def runPutGetTest(self, storageClass, datasetTypeName):
        # New datasets will be added to a run collection, and that run is
        # also where we will look when retrieving them.
        run = "ingest"
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add a second visit for some later tests
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 424,
                "name": "fourtwentyfour",
                "physical_filter": "d-r",
                "visit_system": 1,
            },
        )

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType, as sketched below.
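        # The three equivalent put() signatures exercised by the loop below:
        #
        #     butler.put(metric, refIn)                    # DatasetRef
        #     butler.put(metric, datasetTypeName, dataId)  # type name + dataId
        #     butler.put(metric, datasetType, dataId)      # DatasetType + dataId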

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

            # Can the artifacts themselves be retrieved?
            if not butler.datastore.isEphemeral:
                root_uri = ResourcePath(self.root)

                for preserve_path in (True, False):
                    destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                    # Use copy so that we can test that overwrite
                    # protection works (using "auto" for File URIs would
                    # use hard links and subsequent transfer would work
                    # because it knows they are the same file).
                    transferred = butler.retrieveArtifacts(
                        [ref], destination, preserve_path=preserve_path, transfer="copy"
                    )
                    self.assertGreater(len(transferred), 0)
                    artifacts = list(ResourcePath.findFileResources([destination]))
                    self.assertEqual(set(transferred), set(artifacts))

                    for artifact in transferred:
                        path_in_destination = artifact.relative_to(destination)
                        self.assertIsNotNone(path_in_destination)

                        # When the path is not preserved there should not be
                        # any path separators.
                        num_seps = path_in_destination.count("/")
                        if preserve_path:
                            self.assertGreater(num_seps, 0)
                        else:
                            self.assertEqual(num_seps, 0)

                    primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                    n_uris = len(secondary_uris)
                    if primary_uri:
                        n_uris += 1
                    self.assertEqual(
                        len(artifacts),
                        n_uris,
                        "Comparing expected artifacts vs actual:"
                        f" {artifacts} vs {primary_uri} and {secondary_uris}",
                    )

                    if preserve_path:
                        # No need to run these twice
                        with self.assertRaises(ValueError):
                            butler.retrieveArtifacts([ref], destination, transfer="move")

                        with self.assertRaises(FileExistsError):
                            butler.retrieveArtifacts([ref], destination)

                        transferred_again = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, overwrite=True
                        )
                        self.assertEqual(set(transferred_again), set(transferred))

            # Now remove the dataset completely.
            butler.pruneDatasets([ref], purge=True, unstore=True, run=this_run)
            # Lookup with original args should still fail.
            with self.assertRaises(LookupError):
                butler.datasetExists(*args, collections=this_run)
            # getDirect() should still fail.
            with self.assertRaises(FileNotFoundError):
                butler.getDirect(ref)
            # Registry shouldn't be able to find it by dataset_id anymore.
            self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know the run collection
            # is now empty.
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed.
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion.
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present.

        return butler

    def testDeferredCollectionPassing(self):
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op.
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with TypeError.
        with self.assertRaises(TypeError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is a TypeError.
        with self.assertRaises(TypeError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(TypeError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Removing the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run="ingest")
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {"ingest"})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, CollectionSearch.fromExpression(["other"]))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
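        # The index consulted through DAF_BUTLER_REPOSITORY_INDEX is a simple
        # mapping of label to repository URI; a sketch of the content written
        # below (values are placeholders):
        #
        #     label: /path/to/repo/butler.yaml
        #     bad_label: s3://bucket/not_real.yaml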

        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testIngest(self):
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile, refs=refs, formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

    def testPruneCollections(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge and not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
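        # Collection layout at this point, sketched:
        #
        #     run1 (RUN):      ref1, ref3
        #     run2 (RUN):      ref2
        #     tag1 (TAGGED):   ref3
        #     chain1 (CHAINED) -> [run1, run2]; a search of chain1 yields
        #         ref1 and shadows ref2 (same data ID and dataset type).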

        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [])

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)

    def testPickle(self):
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            (
                "instrument",
                {"instrument": "DummyCam"},
                {"instrument": "DummyHSC"},
                {"instrument": "DummyCamComp"},
            ),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry entries are not created
        # for components but querying them can return the components.
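        # Component dataset types are named "<parent>.<component>", e.g.
        #
        #     DatasetType.nameWithComponent("metric", "summary")  # -> "metric.summary"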

        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

912 def testTransaction(self): 

913 butler = Butler(self.tmpConfigFile, run="ingest") 

914 datasetTypeName = "test_metric" 

915 dimensions = butler.registry.dimensions.extract(["instrument", "visit"]) 

916 dimensionEntries = ( 

917 ("instrument", {"instrument": "DummyCam"}), 

918 ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}), 

919 ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}), 

920 ) 

921 storageClass = self.storageClassFactory.getStorageClass("StructuredData") 

922 metric = makeExampleMetrics() 

923 dataId = {"instrument": "DummyCam", "visit": 42} 

924 # Create and register a DatasetType 

925 datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry) 

926 with self.assertRaises(TransactionTestError): 

927 with butler.transaction(): 

928 # Add needed Dimensions 

929 for args in dimensionEntries: 

930 butler.registry.insertDimensionData(*args) 

931 # Store a dataset 

932 ref = butler.put(metric, datasetTypeName, dataId) 

933 self.assertIsInstance(ref, DatasetRef) 

934 # Test getDirect 

935 metricOut = butler.getDirect(ref) 

936 self.assertEqual(metric, metricOut) 

937 # Test get 

938 metricOut = butler.get(datasetTypeName, dataId) 

939 self.assertEqual(metric, metricOut) 

940 # Check we can get components 

941 self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric) 

942 raise TransactionTestError("This should roll back the entire transaction") 

943 with self.assertRaises(LookupError, msg=f"Check can't expand DataId {dataId}"): 

944 butler.registry.expandDataId(dataId) 

945 # Should raise LookupError for missing data ID value 

946 with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"): 

947 butler.get(datasetTypeName, dataId) 

948 # Also check explicitly if Dataset entry is missing 

949 self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections)) 

950 # Direct retrieval should not find the file in the Datastore 

951 with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"): 

952 butler.getDirect(ref) 

953 

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""

        butler = Butler(self.tmpConfigFile, run="ingest")

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check if a file exists at a given path (relative to root).

        Test testPutTemplates verifies actual physical existence of the files
        in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric1/??#?/d-r/DummyCamComp_423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric2/d-r/DummyCamComp_v423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """This test does an export to a temp directory and an import back
        into a new temp directory repo. It does not assume a POSIX datastore.
        """
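        # Flow, sketched: export datasets plus selected dimension records to
        # exports.yaml in one temporary directory, then import them into a
        # fresh repo in another via script.butlerImport (the implementation
        # behind the "butler import" command line subcommand).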

        exportButler = self.runPutGetTest(storageClass, "test_metric")
        print("Root:", exportButler.datastore.root)
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements even
                # though there aren't any in these datasets or in the database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public api.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                        reuse_ids=False,
                    )
                importButler = Butler(importDir, run="ingest")
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )

    def testRemoveRuns(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # put a dataset in each
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])
        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler"""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self):
        """Independent test of constructor using PathLike."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self):
        """Test local export using all transfer modes"""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        datastoreRoot = exportButler.datastore.root

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume local file system
            self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
                    export.saveDatasets(datasets)
                    for path in pathsInStore:
                        self.assertTrue(
                            self.checkFileExists(exportDir, path),
                            f"Check that mode {transfer} exported files",
                        )

    def testPruneDatasets(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Simple prune.
        butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True)
        with self.assertRaises(LookupError):
            butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1)

        # Put data back.
        ref1 = butler.put(metric, ref1.unresolved(), run=run1)
        ref2 = butler.put(metric, ref2.unresolved(), run=run2)
        ref3 = butler.put(metric, ref3.unresolved(), run=run1)

        # Check that in normal mode, deleting the record will lead to
        # trash not touching the file.
        uri1 = butler.datastore.getURI(ref1)
        butler.datastore.bridge.moveToTrash([ref1])  # Update the dataset_location table
        butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id})
        butler.datastore.trash(ref1)
        butler.datastore.emptyTrash()
        self.assertTrue(uri1.exists())
        uri1.remove()  # Clean it up.

        # Simulate execution butler setup by deleting the datastore
        # record but keeping the file around and trusting.
        butler.datastore.trustGetRequest = True

1340 uri2 = butler.datastore.getURI(ref2) 

1341 uri3 = butler.datastore.getURI(ref3) 

1342 self.assertTrue(uri2.exists()) 

1343 self.assertTrue(uri3.exists()) 

1344 

1345 # Remove the datastore record. 

1346 butler.datastore.bridge.moveToTrash([ref2]) # Update the dataset_location table 

1347 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id}) 

1348 self.assertTrue(uri2.exists()) 

1349 butler.datastore.trash([ref2, ref3]) 

1350 # Immediate removal for ref2 file 

1351 self.assertFalse(uri2.exists()) 

1352 # But ref3 has to wait for the empty. 

1353 self.assertTrue(uri3.exists()) 

1354 butler.datastore.emptyTrash() 

1355 self.assertFalse(uri3.exists()) 

1356 

1357 # Clear out the datasets from registry. 

1358 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 
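        # Summary of the behaviour demonstrated above: with
        # trustGetRequest=False an orphaned artifact (datastore record deleted
        # first) survives trash()/emptyTrash(), whereas with
        # trustGetRequest=True the datastore falls back to checking the file
        # system, so the artifact is removed as well.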

1359 

1360 def testPytypeCoercion(self): 

1361 """Test python type coercion on Butler.get""" 

1362 

1363 # Store some data with the normal example storage class. 

1364 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1365 datasetTypeName = "test_metric" 

1366 butler = self.runPutGetTest(storageClass, datasetTypeName) 

1367 

1368 dataId = {"instrument": "DummyCamComp", "visit": 423} 

1369 metric = butler.get(datasetTypeName, dataId=dataId) 

1370 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample") 

1371 

1372 datasetType_ori = butler.registry.getDatasetType(datasetTypeName) 

1373 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents") 

1374 

1375 # Now need to hack the registry dataset type definition. 

1376 # There is no API for this. 

1377 manager = butler.registry._managers.datasets 

1378 manager._db.update( 

1379 manager._static.dataset_type, 

1380 {"name": datasetTypeName}, 

1381 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"}, 

1382 ) 

1383 

1384 # Force reset of dataset type cache 

1385 butler.registry.refresh() 

1386 

1387 datasetType_new = butler.registry.getDatasetType(datasetTypeName) 

1388 self.assertEqual(datasetType_new.name, datasetType_ori.name) 

1389 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel") 

1390 

1391 metric_model = butler.get(datasetTypeName, dataId=dataId) 

1392 self.assertNotEqual(type(metric_model), type(metric)) 

1393 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel") 

1394 

1395 # Put the model and read it back to show that everything now 

1396 # works as normal. 

1397 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424) 

1398 metric_model_new = butler.get(metric_ref) 

1399 self.assertEqual(metric_model_new, metric_model) 

1400 

1401 # Hack the storage class again to something that will fail on the 

1402 # get with no conversion class. 

1403 manager._db.update( 

1404 manager._static.dataset_type, 

1405 {"name": datasetTypeName}, 

1406 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"}, 

1407 ) 

1408 butler.registry.refresh() 

1409 

1410 with self.assertRaises(ValueError): 

1411 butler.get(datasetTypeName, dataId=dataId) 
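        # The coercion exercised above relies on storage class conversion:
        # when the registry-declared storage class differs from the one used
        # for the put, get() converts the python type if a conversion is
        # defined (MetricsExample -> MetricsExampleModel here) and raises
        # ValueError when none is.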

1412 

1413 

1414class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1415 """InMemoryDatastore specialization of a butler""" 

1416 

1417 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1418 fullConfigKey = None 

1419 useTempRoot = False 

1420 validationCanFail = False 

1421 datastoreStr = ["datastore='InMemory"] 

1422 datastoreName = ["InMemoryDatastore@"] 

1423 registryStr = "/gen3.sqlite3" 

1424 

1425 def testIngest(self): 

1426 pass 

1427 

1428 

1429class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1430 """ChainedDatastore specialization of a butler"""

1431 

1432 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

1433 fullConfigKey = ".datastore.datastores.1.formatters" 

1434 validationCanFail = True 

1435 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"] 

1436 datastoreName = [ 

1437 "InMemoryDatastore@", 

1438 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1", 

1439 "SecondDatastore", 

1440 ] 

1441 registryStr = "/gen3.sqlite3" 

1442 

1443 

1444class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase): 

1445 """Test that a yaml file in one location can refer to a root in another.""" 

1446 

1447 datastoreStr = ["dir1"] 

1448 # Disable the makeRepo test since we are deliberately not using 

1449 # butler.yaml as the config name. 

1450 fullConfigKey = None 

1451 

1452 def setUp(self): 

1453 self.root = makeTestTempDir(TESTDIR) 

1454 

1455 # Make a new repository in one place 

1456 self.dir1 = os.path.join(self.root, "dir1") 

1457 Butler.makeRepo(self.dir1, config=Config(self.configFile)) 

1458 

1459 # Move the yaml file to a different place and add a "root" 

1460 self.dir2 = os.path.join(self.root, "dir2") 

1461 os.makedirs(self.dir2, exist_ok=True) 

1462 configFile1 = os.path.join(self.dir1, "butler.yaml") 

1463 config = Config(configFile1) 

1464 config["root"] = self.dir1 

1465 configFile2 = os.path.join(self.dir2, "butler2.yaml") 

1466 config.dumpToUri(configFile2) 

1467 os.remove(configFile1) 

1468 self.tmpConfigFile = configFile2 

1469 

1470 def testFileLocations(self): 

1471 self.assertNotEqual(self.dir1, self.dir2) 

1472 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml"))) 

1473 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml"))) 

1474 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3"))) 

1475 

1476 

1477class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase): 

1478 """Test that a config file created by makeRepo outside of repo works.""" 

1479 

1480 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1481 

1482 def setUp(self): 

1483 self.root = makeTestTempDir(TESTDIR) 

1484 self.root2 = makeTestTempDir(TESTDIR) 

1485 

1486 self.tmpConfigFile = os.path.join(self.root2, "different.yaml") 

1487 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1488 

1489 def tearDown(self): 

1490 if os.path.exists(self.root2): 

1491 shutil.rmtree(self.root2, ignore_errors=True) 

1492 super().tearDown() 

1493 

1494 def testConfigExistence(self): 

1495 c = Config(self.tmpConfigFile) 

1496 uri_config = ResourcePath(c["root"]) 

1497 uri_expected = ResourcePath(self.root, forceDirectory=True) 

1498 self.assertEqual(uri_config.geturl(), uri_expected.geturl()) 

1499 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path") 

1500 

1501 def testPutGet(self): 

1502 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1503 self.runPutGetTest(storageClass, "test_metric") 

1504 

1505 

1506class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase): 

1507 """Test that a config file created by makeRepo outside of repo works.""" 

1508 

1509 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1510 

1511 def setUp(self): 

1512 self.root = makeTestTempDir(TESTDIR) 

1513 self.root2 = makeTestTempDir(TESTDIR) 

1514 

1515 self.tmpConfigFile = self.root2 

1516 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1517 

1518 def testConfigExistence(self): 

1519 # Append the yaml file else Config constructor does not know the file 

1520 # type. 

1521 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml") 

1522 super().testConfigExistence() 

1523 

1524 

1525class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase): 

1526 """Test that a config file created by makeRepo outside of repo works.""" 

1527 

1528 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1529 

1530 def setUp(self): 

1531 self.root = makeTestTempDir(TESTDIR) 

1532 self.root2 = makeTestTempDir(TESTDIR) 

1533 

1534 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl() 

1535 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1536 

1537 

1538@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!") 

1539@mock_s3 

1540class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1541 """S3Datastore specialization of a butler; an S3 storage Datastore + 

1542 a local file-based SqlRegistry. 

1543 """ 

1544 

1545 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml") 

1546 fullConfigKey = None 

1547 validationCanFail = True 

1548 

1549 bucketName = "anybucketname" 

1550 """Name of the Bucket that will be used in the tests. The name is read from 

1551 the config file used with the tests during set-up. 

1552 """ 

1553 

1554 root = "butlerRoot/" 

1555 """Root repository directory expected to be used in case useTempRoot=False. 

1556 Otherwise the root is set to a randomly generated 20-character string 

1557 during set-up. 

1558 """ 

1559 

1560 datastoreStr = [f"datastore={root}"] 

1561 """Contains all expected root locations in a format expected to be 

1562 returned by Butler stringification. 

1563 """ 

1564 

1565 datastoreName = [f"FileDatastore@s3://{bucketName}/{root}"] 

1566 """The expected format of the S3 Datastore string.""" 

1567 

1568 registryStr = "/gen3.sqlite3" 

1569 """Expected format of the Registry string.""" 

1570 

1571 def genRoot(self): 

1572 """Returns a random 20-character string to serve as a root 

1573 name for the temporary bucket repo. 

1574 

1575 This is equivalent to tempfile.mkdtemp, since this is what self.root 

1576 becomes when useTempRoot is True. 

1577 """ 

1578 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20)) 

1579 return rndstr + "/" 

1580 

1581 def setUp(self): 

1582 config = Config(self.configFile) 

1583 uri = ResourcePath(config[".datastore.datastore.root"]) 

1584 self.bucketName = uri.netloc 

1585 

1586 # set up some fake credentials if they do not exist 

1587 self.usingDummyCredentials = setAwsEnvCredentials() 

1588 

1589 if self.useTempRoot: 

1590 self.root = self.genRoot() 

1591 rooturi = f"s3://{self.bucketName}/{self.root}" 

1592 config.update({"datastore": {"datastore": {"root": rooturi}}}) 

1593 

1594 # need local folder to store registry database 

1595 self.reg_dir = makeTestTempDir(TESTDIR) 

1596 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

1597 

1598 # MOTO needs to know that we expect Bucket bucketname to exist 

1599 # (this used to be the class attribute bucketName) 

1600 s3 = boto3.resource("s3") 

1601 s3.create_bucket(Bucket=self.bucketName) 

1602 

1603 self.datastoreStr = f"datastore={self.root}" 

1604 self.datastoreName = [f"FileDatastore@{rooturi}"] 

1605 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False) 

1606 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml") 
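        # The moto pattern used above, reduced to a standalone sketch (the
        # bucket name is hypothetical):
        #
        #     @mock_s3
        #     class MyS3Test(unittest.TestCase):
        #         def setUp(self):
        #             boto3.resource("s3").create_bucket(Bucket="anybucketname")
        #
        # Every boto3 call made while the decorator is active hits the mock,
        # never real AWS.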

1607 

1608 def tearDown(self): 

1609 s3 = boto3.resource("s3") 

1610 bucket = s3.Bucket(self.bucketName) 

1611 try: 

1612 bucket.objects.all().delete() 

1613 except botocore.exceptions.ClientError as e: 

1614 if e.response["Error"]["Code"] == "404": 

1615 # the key was not reachable - pass 

1616 pass 

1617 else: 

1618 raise 

1619 

1620 bucket = s3.Bucket(self.bucketName) 

1621 bucket.delete() 

1622 

1623 # unset any potentially set dummy credentials 

1624 if self.usingDummyCredentials: 

1625 unsetAwsEnvCredentials() 

1626 

1627 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

1628 shutil.rmtree(self.reg_dir, ignore_errors=True) 

1629 

1630 if self.useTempRoot and os.path.exists(self.root): 

1631 shutil.rmtree(self.root, ignore_errors=True) 

1632 

1633 

1634@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!") 

1635# Mock required environment variables during tests 

1636@unittest.mock.patch.dict( 

1637 os.environ, 

1638 { 

1639 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1640 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"), 

1641 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs", 

1642 }, 

1643) 

1644class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1645 """WebdavDatastore specialization of a butler; a Webdav storage Datastore + 

1646 a local file-based SqlRegistry. 

1647 """ 

1648 

1649 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml") 

1650 fullConfigKey = None 

1651 validationCanFail = True 

1652 

1653 serverName = "localhost" 

1654 """Name of the server that will be used in the tests. 

1655 """ 

1656 

1657 portNumber = 8080 

1658 """Port on which the webdav server listens. Automatically chosen 

1659 at setUpClass via the _getfreeport() method. 

1660 """ 

1661 

1662 root = "butlerRoot/" 

1663 """Root repository directory expected to be used in case useTempRoot=False. 

1664 Otherwise the root is set to a randomly generated 20-character string 

1665 during set-up. 

1666 """ 

1667 

1668 datastoreStr = [f"datastore={root}"] 

1669 """Contains all expected root locations in a format expected to be 

1670 returned by Butler stringification. 

1671 """ 

1672 

1673 datastoreName = [f"FileDatastore@https://{serverName}/{root}"] 

1674 """The expected format of the WebdavDatastore string.""" 

1675 

1676 registryStr = "/gen3.sqlite3" 

1677 """Expected format of the Registry string.""" 

1678 

1679 serverThread = None 

1680 """Thread in which the local webdav server will run""" 

1681 

1682 stopWebdavServer = False 

1683 """This flag will cause the webdav server to 

1684 gracefully shut down when True 

1685 """ 

1686 

1687 def genRoot(self): 

1688 """Returns a random 20-character string to serve as a root 

1689 name for the temporary bucket repo. 

1690 

1691 This is equivalent to tempfile.mkdtemp, since this is what self.root 

1692 becomes when useTempRoot is True. 

1693 """ 

1694 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20)) 

1695 return rndstr + "/" 

1696 

1697 @classmethod 

1698 def setUpClass(cls): 

1699 # Do the same as inherited class 

1700 cls.storageClassFactory = StorageClassFactory() 

1701 cls.storageClassFactory.addFromConfig(cls.configFile) 

1702 

1703 cls.portNumber = cls._getfreeport() 

1704 # Run a local webdav server on which tests will be run 

1705 cls.serverThread = Thread( 

1706 target=cls._serveWebdav, args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), daemon=True 

1707 ) 

1708 cls.serverThread.start() 

1709 # Wait for it to start 

1710 time.sleep(3) 

1711 

1712 @classmethod 

1713 def tearDownClass(cls): 

1714 # Ask for graceful shut down of the webdav server 

1715 cls.stopWebdavServer = True 

1716 # Wait for the thread to exit 

1717 cls.serverThread.join() 

1718 

1719 # Mock required environment variables during tests 

1720 @unittest.mock.patch.dict( 

1721 os.environ, 

1722 { 

1723 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1724 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"), 

1725 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs", 

1726 }, 

1727 ) 

1728 def setUp(self): 

1729 config = Config(self.configFile) 

1730 

1731 if self.useTempRoot: 

1732 self.root = self.genRoot() 

1733 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}" 

1734 config.update({"datastore": {"datastore": {"root": self.rooturi}}}) 

1735 

1736 # need local folder to store registry database 

1737 self.reg_dir = makeTestTempDir(TESTDIR) 

1738 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

1739 

1740 self.datastoreStr = f"datastore={self.root}" 

1741 self.datastoreName = [f"FileDatastore@{self.rooturi}"] 

1742 

1743 if not isWebdavEndpoint(self.rooturi): 

1744 raise OSError("Webdav server not running properly: cannot run tests.") 

1745 

1746 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False) 

1747 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml") 

1748 

1749 # Mock required environment variables during tests 

1750 @unittest.mock.patch.dict( 

1751 os.environ, 

1752 { 

1753 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1754 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"), 

1755 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs", 

1756 }, 

1757 ) 

1758 def tearDown(self): 

1759 # Clear temporary directory 

1760 ResourcePath(self.rooturi).remove() 

1761 ResourcePath(self.rooturi).session.close() 

1762 

1763 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

1764 shutil.rmtree(self.reg_dir, ignore_errors=True) 

1765 

1766 if self.useTempRoot and os.path.exists(self.root): 

1767 shutil.rmtree(self.root, ignore_errors=True) 

1768 

1769 def _serveWebdav(self, port: int, stopWebdavServer): 

1770 """Starts a local webdav-compatible HTTP server 

1771 listening on http://localhost:port. 

1772 This server only runs while this test class is instantiated, 

1773 and then shuts down. Must be started in a separate thread. 

1774 

1775 Parameters 

1776 ---------- 

1777 port : `int` 

1778 The port number on which the server should listen. 

1779 """ 

1780 root_path = gettempdir() 

1781 

1782 config = { 

1783 "host": "0.0.0.0", 

1784 "port": port, 

1785 "provider_mapping": {"/": root_path}, 

1786 "http_authenticator": {"domain_controller": None}, 

1787 "simple_dc": {"user_mapping": {"*": True}}, 

1788 "verbose": 0, 

1789 } 

1790 app = WsgiDAVApp(config) 

1791 

1792 server_args = { 

1793 "bind_addr": (config["host"], config["port"]), 

1794 "wsgi_app": app, 

1795 } 

1796 server = wsgi.Server(**server_args) 

1797 server.prepare() 

1798 

1799 try: 

1800 # Start the actual server in a separate thread 

1801 t = Thread(target=server.serve, daemon=True) 

1802 t.start() 

1803 # watch stopWebdavServer, and gracefully 

1804 # shut down the server when True 

1805 while True: 

1806 if stopWebdavServer(): 

1807 break 

1808 time.sleep(1) 

1809 except KeyboardInterrupt: 

1810 print("Caught Ctrl-C, shutting down...") 

1811 finally: 

1812 server.stop() 

1813 t.join() 

1814 

1815 def _getfreeport(): 

1816 """ 

1817 Determines a free port using sockets. 

1818 """ 

1819 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 

1820 free_socket.bind(("0.0.0.0", 0)) 

1821 free_socket.listen() 

1822 port = free_socket.getsockname()[1] 

1823 free_socket.close() 

1824 return port 
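    # Note: there is an inherent race here -- the chosen port may be taken by
    # another process between close() and its reuse by the server -- which is
    # generally acceptable for a short-lived test fixture.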

1825 

1826 

1827class PosixDatastoreTransfers(unittest.TestCase): 

1828 """Test data transfers between butlers. 

1829 

1830 Test for different managers. UUID to UUID and integer to integer are 

1831 tested. UUID to integer is not supported since we do not currently 

1832 want to allow that. Integer to UUID is supported with the caveat 

1833 that UUID4 will be generated and this will be incorrect for raw 

1834 dataset types. The test ignores that. 

1835 """ 
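    # A minimal sketch of the high-level call under test, assuming `source`
    # and `target` Butler instances and a list of resolved `source_refs`:
    #
    #     transferred = target.transfer_from(
    #         source,
    #         source_refs,
    #         register_dataset_types=True,
    #     )
    #
    # `transferred` lists the refs actually copied into the target.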

1836 

1837 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1838 

1839 @classmethod 

1840 def setUpClass(cls): 

1841 cls.storageClassFactory = StorageClassFactory() 

1842 cls.storageClassFactory.addFromConfig(cls.configFile) 

1843 

1844 def setUp(self): 

1845 self.root = makeTestTempDir(TESTDIR) 

1846 self.config = Config(self.configFile) 

1847 

1848 def tearDown(self): 

1849 removeTestTempDir(self.root) 

1850 

1851 def create_butler(self, manager, label): 

1852 config = Config(self.configFile) 

1853 config["registry", "managers", "datasets"] = manager 

1854 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True) 
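        # Butler.makeRepo() returns the new repository's configuration, which
        # can be passed straight back to the Butler constructor, as done
        # above.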

1855 

1856 def create_butlers(self, manager1, manager2): 

1857 self.source_butler = self.create_butler(manager1, "1") 

1858 self.target_butler = self.create_butler(manager2, "2") 

1859 

1860 def testTransferUuidToUuid(self): 

1861 self.create_butlers( 

1862 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1863 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1864 ) 

1865 # Setting id_gen_map should have no effect here 

1866 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE}) 

1867 

1868 def testTransferIntToInt(self): 

1869 self.create_butlers( 

1870 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager", 

1871 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager", 

1872 ) 

1873 # int dataset ID only allows UNIQUE 

1874 self.assertButlerTransfers() 

1875 

1876 def testTransferIntToUuid(self): 

1877 self.create_butlers( 

1878 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager", 

1879 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1880 ) 

1881 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE}) 

1882 

1883 def testTransferMissing(self): 

1884 """Test transfers where datastore records are missing. 

1885 

1886 This is how execution butler works. 

1887 """ 

1888 self.create_butlers( 

1889 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1890 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1891 ) 

1892 

1893 # Configure the source butler to allow trust. 

1894 self.source_butler.datastore.trustGetRequest = True 

1895 

1896 self.assertButlerTransfers(purge=True) 

1897 

1898 def testTransferMissingDisassembly(self): 

1899 """Test transfers where datastore records are missing. 

1900 

1901 This is how execution butler works. 

1902 """ 

1903 self.create_butlers( 

1904 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1905 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1906 ) 

1907 

1908 # Configure the source butler to allow trust. 

1909 self.source_butler.datastore.trustGetRequest = True 

1910 

1911 # Test disassembly. 

1912 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite") 

1913 

1914 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"): 

1915 """Test that a run can be transferred to another butler.""" 

1916 

1917 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

1918 datasetTypeName = "random_data" 

1919 

1920 # Test will create 3 collections and we will want to transfer 

1921 # two of those three. 

1922 runs = ["run1", "run2", "other"] 

1923 

1924 # Also want to use two different dataset types to ensure that 

1925 # grouping works. 

1926 datasetTypeNames = ["random_data", "random_data_2"] 

1927 

1928 # Create the run collections in the source butler. 

1929 for run in runs: 

1930 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

1931 

1932 # Create dimensions in both butlers (transfer will not create them). 

1933 n_exposures = 30 

1934 for butler in (self.source_butler, self.target_butler): 

1935 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

1936 butler.registry.insertDimensionData( 

1937 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

1938 ) 

1939 butler.registry.insertDimensionData( 

1940 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"} 

1941 ) 

1942 

1943 for i in range(n_exposures): 

1944 butler.registry.insertDimensionData( 

1945 "exposure", 

1946 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"}, 

1947 ) 

1948 

1949 # Create dataset types in the source butler. 

1950 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"]) 

1951 for datasetTypeName in datasetTypeNames: 

1952 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

1953 self.source_butler.registry.registerDatasetType(datasetType) 

1954 

1955 # Write a dataset to an unrelated run -- this will ensure that 

1956 # we are rewriting integer dataset ids in the target if necessary. 

1957 # Will not be relevant for UUID. 

1958 run = "distraction" 

1959 butler = Butler(butler=self.source_butler, run=run) 

1960 butler.put( 

1961 makeExampleMetrics(), 

1962 datasetTypeName, 

1963 exposure=1, 

1964 instrument="DummyCamComp", 

1965 physical_filter="d-r", 

1966 ) 

1967 

1968 # Write some example metrics to the source 

1969 butler = Butler(butler=self.source_butler) 

1970 

1971 # Set of DatasetRefs that should be in the list of refs to transfer 

1972 # but which will not be transferred. 

1973 deleted = set() 

1974 

1975 n_expected = 20 # Number of datasets expected to be transferred 

1976 source_refs = [] 

1977 for i in range(n_exposures): 

1978 # Put a third of datasets into each collection, only retain 

1979 # two thirds. 

1980 index = i % 3 

1981 run = runs[index] 

1982 datasetTypeName = datasetTypeNames[i % 2] 

1983 

1984 metric_data = { 

1985 "summary": {"counter": i}, 

1986 "output": {"text": "metric"}, 

1987 "data": [2 * x for x in range(i)], 

1988 } 

1989 metric = MetricsExample(**metric_data) 

1990 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"} 

1991 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run) 

1992 

1993 # Remove the datastore record using low-level API 

1994 if purge: 

1995 # Remove records for a fraction. 

1996 if index == 1: 

1997 

1998 # For one of these delete the file as well. 

1999 # This allows the "missing" code to filter the 

2000 # file out. 

2001 if not deleted: 

2002 primary, uris = butler.datastore.getURIs(ref) 

2003 if primary: 

2004 primary.remove() 

2005 for uri in uris.values(): 

2006 uri.remove() 

2007 n_expected -= 1 

2008 deleted.add(ref) 

2009 

2010 # Remove the datastore record. 

2011 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

2012 

2013 if index < 2: 

2014 source_refs.append(ref) 

2015 if ref not in deleted: 

2016 new_metric = butler.get(ref.unresolved(), collections=run) 

2017 self.assertEqual(new_metric, metric) 

2018 

2019 # Create some bad dataset types to ensure we check for inconsistent 

2020 # definitions. 

2021 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList") 

2022 for datasetTypeName in datasetTypeNames: 

2023 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass) 

2024 self.target_butler.registry.registerDatasetType(datasetType) 

2025 with self.assertRaises(ConflictingDefinitionError): 

2026 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map) 

2027 # And remove the bad definitions. 

2028 for datasetTypeName in datasetTypeNames: 

2029 self.target_butler.registry.removeDatasetType(datasetTypeName) 

2030 

2031 # Transfer without creating dataset types should fail. 

2032 with self.assertRaises(KeyError): 

2033 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map) 

2034 

2035 # Now transfer them to the second butler 

2036 with self.assertLogs(level=logging.DEBUG) as cm: 

2037 transferred = self.target_butler.transfer_from( 

2038 self.source_butler, source_refs, id_gen_map=id_gen_map, register_dataset_types=True 

2039 ) 

2040 self.assertEqual(len(transferred), n_expected) 

2041 log_output = ";".join(cm.output) 

2042 self.assertIn("found in datastore for chunk", log_output) 

2043 self.assertIn("Creating output run", log_output) 

2044 

2045 # Do the transfer twice to ensure that it will do nothing extra. 

2046 # Only do this if purge=True because it does not work for int 

2047 # dataset_id. 

2048 if purge: 

2049 # This should not need to register dataset types. 

2050 transferred = self.target_butler.transfer_from( 

2051 self.source_butler, source_refs, id_gen_map=id_gen_map 

2052 ) 

2053 self.assertEqual(len(transferred), n_expected) 

2054 

2055 # Also do an explicit low-level transfer to trigger some 

2056 # edge cases. 

2057 with self.assertLogs(level=logging.DEBUG) as cm: 

2058 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs) 

2059 log_output = ";".join(cm.output) 

2060 self.assertIn("no file artifacts exist", log_output) 

2061 

2062 with self.assertRaises(TypeError): 

2063 self.target_butler.datastore.transfer_from(self.source_butler, source_refs) 

2064 

2065 with self.assertRaises(ValueError): 

2066 self.target_butler.datastore.transfer_from( 

2067 self.source_butler.datastore, source_refs, transfer="split" 

2068 ) 

2069 

2070 # Now try to get the same refs from the new butler. 

2071 for ref in source_refs: 

2072 if ref not in deleted: 

2073 unresolved_ref = ref.unresolved() 

2074 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run) 

2075 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run) 

2076 self.assertEqual(new_metric, old_metric) 

2077 

2078 # Now prune run2 collection and create instead a CHAINED collection. 

2079 # This should block the transfer. 

2080 self.target_butler.pruneCollection("run2", purge=True, unstore=True) 

2081 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED) 

2082 with self.assertRaises(TypeError): 

2083 # Re-importing the run1 datasets can be problematic if they 

2084 # use integer IDs so filter those out. 

2085 to_transfer = [ref for ref in source_refs if ref.run == "run2"] 

2086 self.target_butler.transfer_from(self.source_butler, to_transfer, id_gen_map=id_gen_map) 

2087 

2088 

2089 if __name__ == "__main__": 2089 ↛ 2090: line 2089 didn't jump to line 2090, because the condition on line 2089 was never true

2090 unittest.main()