Coverage for tests/test_butler.py: 16%

1095 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler."""

import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import socket
import string
import tempfile
import time
import unittest
import unittest.mock

try:
    import boto3
    import botocore
    from moto import mock_s3
except ImportError:
    boto3 = None

    def mock_s3(cls):
        """A no-op decorator in case moto mock_s3 can not be imported."""
        return cls


try:
    from cheroot import wsgi
    from wsgidav.wsgidav_app import WsgiDAVApp
except ImportError:
    WsgiDAVApp = None
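    # Like ``boto3 = None`` above, leaving WsgiDAVApp as None acts as a
    # sentinel: tests that need a WebDAV server can check for it and skip
    # when the optional dependency is missing.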

from tempfile import gettempdir
from threading import Thread

import astropy.time
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    CollectionSearch,
    CollectionType,
    Config,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError, MissingCollectionError
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.http import isWebdavEndpoint
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImport
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def makeExampleMetrics():
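    """Return a small MetricsExample with fixed summary, output, and data
    values, used as the payload for the put/get tests in this module."""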

    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent the misdiagnosis
    that might otherwise occur when a standard exception is used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not covered by any other test
    cases."""

    def testSearchPath(self):
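        """Check that an explicit search path overrides values from the
        default configuration (here the datastore records table name)."""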

        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests:
    """Helper methods for running a suite of put/get tests from different
    butler configurations."""

    root = None

    @staticmethod
    def addDatasetType(datasetTypeName, dimensions, storageClass, registry):
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls):
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(self, butler, datasetRef, components, reference, collections=None):
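        """Check that each named component can be read back, both via
        butler.get on the component dataset type name and via a deferred
        handle, with values matching the reference object."""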

        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDirectDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self):
        removeTestTempDir(self.root)

    def runPutGetTest(self, storageClass, datasetTypeName):
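        """Exercise put/get round trips through the different call
        signatures, component and parameterized reads, artifact retrieval,
        and dataset removal, returning the Butler for further checks."""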

        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = "ingest"
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, set([run]))

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add a second visit for some later tests
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 424,
                "name": "fourtwentyfour",
                "physical_filter": "d-r",
                "visit_system": 1,
            },
        )

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Create a DatasetRef for put
        refIn = DatasetRef(datasetType, dataId, id=None)

        # Put with a preexisting id should fail
        with self.assertRaises(ValueError):
            butler.put(metric, DatasetRef(datasetType, dataId, id=100))

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        for args in ((refIn,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                ref = butler.put(metric, *args, run=this_run)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a datasetRef
                metricOut = butler.getDeferred(ref, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # and deferred direct with ref
                metricOut = butler.getDirectDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

            # Can the artifacts themselves be retrieved?
            if not butler.datastore.isEphemeral:
                root_uri = ResourcePath(self.root)

                for preserve_path in (True, False):
                    destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                    # Use copy so that we can test that overwrite
                    # protection works (using "auto" for File URIs would
                    # use hard links and subsequent transfer would work
                    # because it knows they are the same file).
                    transferred = butler.retrieveArtifacts(
                        [ref], destination, preserve_path=preserve_path, transfer="copy"
                    )
                    self.assertGreater(len(transferred), 0)
                    artifacts = list(ResourcePath.findFileResources([destination]))
                    self.assertEqual(set(transferred), set(artifacts))

                    for artifact in transferred:
                        path_in_destination = artifact.relative_to(destination)
                        self.assertIsNotNone(path_in_destination)

                        # When the path is not preserved there should not be
                        # any path separators.
                        num_seps = path_in_destination.count("/")
                        if preserve_path:
                            self.assertGreater(num_seps, 0)
                        else:
                            self.assertEqual(num_seps, 0)

                    primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                    n_uris = len(secondary_uris)
                    if primary_uri:
                        n_uris += 1
                    self.assertEqual(
                        len(artifacts),
                        n_uris,
                        "Comparing expected artifacts vs actual:"
                        f" {artifacts} vs {primary_uri} and {secondary_uris}",
                    )

                    if preserve_path:
                        # No need to run these twice
                        with self.assertRaises(ValueError):
                            butler.retrieveArtifacts([ref], destination, transfer="move")

                        with self.assertRaises(FileExistsError):
                            butler.retrieveArtifacts([ref], destination)

                        transferred_again = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, overwrite=True
                        )
                        self.assertEqual(set(transferred_again), set(transferred))

            # Now remove the dataset completely.
            butler.pruneDatasets([ref], purge=True, unstore=True, run=this_run)
            # Lookup with original args should still fail.
            with self.assertRaises(LookupError):
                butler.datasetExists(*args, collections=this_run)
            # getDirect() should still fail.
            with self.assertRaises(FileNotFoundError):
                butler.getDirect(ref)
            # Registry shouldn't be able to find it by dataset_id anymore.
            self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know the run collections
            # are now empty.
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            summary = butler.getDirect(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaises(ValueError):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaises(ValueError):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match
        with self.assertRaises(ValueError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=101))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaises(KeyError):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Check that we can configure a butler to accept a put even
        # if it already has the dataset in registry.
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Allow the put to succeed
        butler._allow_put_of_predefined_dataset = True
        ref2 = butler.put(metric, refIn)
        self.assertEqual(ref2.id, ref.id)

        # A second put will still fail but with a different exception
        # than before.
        with self.assertRaises(ConflictingDefinitionError):
            butler.put(metric, refIn)

        # Reset the flag to avoid confusion
        butler._allow_put_of_predefined_dataset = False

        # Leave the dataset in place since some downstream tests require
        # something to be present

        return butler

    def testDeferredCollectionPassing(self):
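        """Test that a butler constructed without a run or collection can
        still put and get datasets when collections are passed explicitly
        to each call."""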

        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with TypeError.
        with self.assertRaises(TypeError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection is a TypeError.
        with self.assertRaises(TypeError):
            butler.datasetExists(datasetType, dataId)
        with self.assertRaises(TypeError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Removing the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.datasetExists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self):
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run="ingest")
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {"ingest"})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, CollectionSearch.fromExpression(["other"]))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"s3://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), set(("label", "bad_label")))
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertIn("not known to", str(cm.exception))
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertIn("No repository index defined", str(cm.exception))
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self):
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self):
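        """Put/get test for a composite that is stored as a single
        artifact, i.e. never disassembled by the datastore."""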

        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self):
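        """Put/get test for a composite that file-based datastores are
        expected to disassemble into per-component artifacts."""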

        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections="ingest"))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testIngest(self):
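        """Test ingest of external files, both one dataset per file and
        multiple datasets sharing a single file."""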

        butler = Butler(self.tmpConfigFile, run="ingest")

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImport("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(FileDataset(path=metricFile, refs=refs, formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.datasetExists(datasetTypeName, dataId1))
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

    def testPruneCollections(self):
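        """Test pruning of RUN, TAGGED, and CHAINED collections, and the
        effect of the purge and unstore options on registry and datastore."""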

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put some datasets. ref1 and ref2 have the same data ID, and are in
        # different runs. ref3 has a different data ID.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1)

        # Try to delete a RUN collection without purge, or with purge and not
        # unstore.
        with self.assertRaises(TypeError):
            butler.pruneCollection(run1)
        with self.assertRaises(TypeError):
            butler.pruneCollection(run2, purge=True)
        # Add a TAGGED collection and associate ref3 only into it.
        tag1 = "tag1"
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertTrue(registered)
        # Registering a second time should be allowed.
        registered = butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        self.assertFalse(registered)
        butler.registry.associate(tag1, [ref3])
        # Add a CHAINED collection that searches run1 and then run2. It
        # logically contains only ref1, because ref2 is shadowed due to them
        # having the same data ID and dataset type.
        chain1 = "chain1"
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        # Try to delete RUN collections, which should fail with complete
        # rollback because they're still referenced by the CHAINED
        # collection.
        with self.assertRaises(Exception):
            butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(Exception):
            butler.pruneCollection(run2, purge=True, unstore=True)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Try to delete CHAINED and TAGGED collections with purge; should not
        # work.
        with self.assertRaises(TypeError):
            butler.pruneCollection(tag1, purge=True, unstore=True)
        with self.assertRaises(TypeError):
            butler.pruneCollection(chain1, purge=True, unstore=True)
        # Remove the tagged collection with unstore=False. This should not
        # affect the datasets.
        butler.pruneCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertTrue(existence[ref3])
        # Add the tagged collection back in, and remove it with unstore=True.
        # This should remove ref3 only from the datastore.
        butler.registry.registerCollection(tag1, type=CollectionType.TAGGED)
        butler.registry.associate(tag1, [ref3])
        butler.pruneCollection(tag1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(tag1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Delete the chain with unstore=False. The datasets should not be
        # affected at all.
        butler.pruneCollection(chain1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertTrue(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Redefine and then delete the chain with unstore=True. Only ref1
        # should be unstored (ref3 has already been unstored, but otherwise
        # would be now).
        butler.registry.registerCollection(chain1, type=CollectionType.CHAINED)
        butler.registry.setCollectionChain(chain1, [run1, run2])
        butler.pruneCollection(chain1, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(chain1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref1, ref2, ref3])
        existence = butler.datastore.mexists([ref1, ref2, ref3])
        self.assertFalse(existence[ref1])
        self.assertTrue(existence[ref2])
        self.assertFalse(existence[ref3])
        # Remove run1. This removes ref1 and ref3 from the registry (they're
        # already gone from the datastore, which is fine).
        butler.pruneCollection(run1, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [ref2])
        self.assertTrue(butler.datastore.exists(ref2))
        # Remove run2. This removes ref2 from the registry and the datastore.
        butler.pruneCollection(run2, purge=True, unstore=True)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertCountEqual(set(butler.registry.queryDatasets(..., collections=...)), [])

        # Now that the collections have been pruned we can remove the
        # dataset type.
        butler.registry.removeDatasetType(datasetType.name)

    def testPickle(self):
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self):
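        """Test querying of dataset types, including components, and
        validation of the butler configuration against them."""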

        butler = Butler(self.tmpConfigFile, run="ingest")
        dimensions = butler.registry.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries = [
            (
                "instrument",
                {"instrument": "DummyCam"},
                {"instrument": "DummyHSC"},
                {"instrument": "DummyCamComp"},
            ),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for args in dimensionEntries:
            butler.registry.insertDimensionData(*args)

        # When a DatasetType is added to the registry, entries are not
        # created for its components, but querying can still return the
        # components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry = set(butler.registry.queryDatasetTypes(components=True))
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self):
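        """Test that an exception raised inside a transaction rolls back
        all registry inserts and datastore writes made within it."""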

        butler = Butler(self.tmpConfigFile, run="ingest")
        datasetTypeName = "test_metric"
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        dimensionEntries = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.getDirect(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(LookupError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.getDirect(ref)

    def testMakeRepo(self):
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self):
        butler = Butler(self.tmpConfigFile, run="ingest")
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self):
        """Test that dataIds can be rewritten based on dimension records."""
        butler = Butler(self.tmpConfigFile, run="ingest")

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.registry.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root, relpath):
        """Check if a file exists at a given path (relative to root).

        The testPutTemplates test verifies the actual physical existence
        of the files in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self):
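        """Test that file templates produce the expected paths and that
        templates yielding non-unique filenames are rejected."""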

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.registry.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric1/??#?/d-r/DummyCamComp_423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(
            self.checkFileExists(butler.datastore.root, "ingest/metric2/d-r/DummyCamComp_v423.pickle"),
            f"Checking existence of {uri}",
        )

        # Check the template based on dimensions
        butler.datastore.templates.validateTemplates([ref])

        # Now use a file template that will not result in unique filenames
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

    def testImportExport(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runImportExportTest(storageClass)

    @unittest.expectedFailure
    def testImportExportVirtualComposite(self):
        # Run put/get tests just to create and populate a repo.
        storageClass = self.storageClassFactory.getStorageClass("StructuredComposite")
        self.runImportExportTest(storageClass)

    def runImportExportTest(self, storageClass):
        """Export to a temp directory and import back into a new temp
        directory repo. Does not assume a POSIX datastore.
        """
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        print("Root:", exportButler.datastore.root)
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        # Add a DimensionRecord that's unused by those datasets.
        skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")}
        exportButler.registry.insertDimensionData("skymap", skymapRecord)
        # Export and then import datasets.
        with safeTestTempDir(TESTDIR) as exportDir:
            exportFile = os.path.join(exportDir, "exports.yaml")
            with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export:
                export.saveDatasets(datasets)
                # Export the same datasets again. This should quietly do
                # nothing because of internal deduplication, and it shouldn't
                # complain about being asked to export the "htm7" elements even
                # though there aren't any in these datasets or in the database.
                export.saveDatasets(datasets, elements=["htm7"])
                # Save one of the data IDs again; this should be harmless
                # because of internal deduplication.
                export.saveDataIds([datasets[0].dataId])
                # Save some dimension records directly.
                export.saveDimensionData("skymap", [skymapRecord])
            self.assertTrue(os.path.exists(exportFile))
            with safeTestTempDir(TESTDIR) as importDir:
                # We always want this to be a local posix butler
                Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml")))
                # Calling script.butlerImport tests the implementation of the
                # butler command line interface "import" subcommand. Functions
                # in the script folder are generally considered protected and
                # should not be used as public api.
                with open(exportFile, "r") as f:
                    script.butlerImport(
                        importDir,
                        export_file=f,
                        directory=exportDir,
                        transfer="auto",
                        skip_dimensions=None,
                        reuse_ids=False,
                    )
                importButler = Butler(importDir, run="ingest")
                for ref in datasets:
                    with self.subTest(ref=ref):
                        # Test for existence by passing in the DatasetType and
                        # data ID separately, to avoid lookup by dataset_id.
                        self.assertTrue(importButler.datasetExists(ref.datasetType, ref.dataId))
                self.assertEqual(
                    list(importButler.registry.queryDimensionRecords("skymap")),
                    [importButler.registry.dimensions["skymap"].RecordClass(**skymapRecord)],
                )

    def testRemoveRuns(self):
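        """Test removeRuns, with and without unstoring the underlying
        artifacts."""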

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Load registry data with dimensions to hang datasets off of.
        registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
        butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
        # Add some RUN-type collections.
        run1 = "run1"
        butler.registry.registerRun(run1)
        run2 = "run2"
        butler.registry.registerRun(run2)
        # Put a dataset in each.
        metric = makeExampleMetrics()
        dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"])
        datasetType = self.addDatasetType(
            "prune_collections_test_dataset", dimensions, storageClass, butler.registry
        )
        ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1)
        ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2)
        uri1 = butler.getURI(ref1, collections=[run1])
        uri2 = butler.getURI(ref2, collections=[run2])
        # Remove from both runs with different values for unstore.
        butler.removeRuns([run1], unstore=True)
        butler.removeRuns([run2], unstore=False)
        # Should be nothing in registry for either one, and datastore should
        # not think either exists.
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run1)
        with self.assertRaises(MissingCollectionError):
            butler.registry.getCollectionType(run2)
        self.assertFalse(butler.datastore.exists(ref1))
        self.assertFalse(butler.datastore.exists(ref2))
        # The ref we unstored should be gone according to the URI, but the
        # one we forgot should still be around.
        self.assertFalse(uri1.exists())
        self.assertTrue(uri2.exists())


class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    """PosixDatastore specialization of a butler."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")
    fullConfigKey = ".datastore.formatters"
    validationCanFail = True
    datastoreStr = ["/tmp"]
    datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"]
    registryStr = "/gen3.sqlite3"

    def testPathConstructor(self):
        """Independent test of constructor using PathLike."""
        butler = Butler(self.tmpConfigFile, run="ingest")
        self.assertIsInstance(butler, Butler)

        # And again with a Path object with the butler yaml
        path = pathlib.Path(self.tmpConfigFile)
        butler = Butler(path, writeable=False)
        self.assertIsInstance(butler, Butler)

        # And again with a Path object without the butler yaml
        # (making sure we skip it if the tmp config doesn't end
        # in butler.yaml -- which is the case for a subclass)
        if self.tmpConfigFile.endswith("butler.yaml"):
            path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
            butler = Butler(path, writeable=False)
            self.assertIsInstance(butler, Butler)

    def testExportTransferCopy(self):
        """Test local export using several transfer modes."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        exportButler = self.runPutGetTest(storageClass, "test_metric")
        # Test that the repo actually has at least one dataset.
        datasets = list(exportButler.registry.queryDatasets(..., collections=...))
        self.assertGreater(len(datasets), 0)
        uris = [exportButler.getURI(d) for d in datasets]
        datastoreRoot = exportButler.datastore.root

        pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris]

        for path in pathsInStore:
            # Assume local file system
            self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}")

        for transfer in ("copy", "link", "symlink", "relsymlink"):
            with safeTestTempDir(TESTDIR) as exportDir:
                with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export:
                    export.saveDatasets(datasets)
                for path in pathsInStore:
                    self.assertTrue(
                        self.checkFileExists(exportDir, path),
                        f"Check that mode {transfer} exported files",
                    )

    def testPruneDatasets(self):
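        """Test pruneDatasets, including trash handling when the datastore
        record has been removed while running in trust mode."""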

1290 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1291 butler = Butler(self.tmpConfigFile, writeable=True) 

1292 # Load registry data with dimensions to hang datasets off of. 

1293 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry")) 

1294 butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) 

1295 # Add some RUN-type collections. 

1296 run1 = "run1" 

1297 butler.registry.registerRun(run1) 

1298 run2 = "run2" 

1299 butler.registry.registerRun(run2) 

1300 # Put some datasets. ref1 and ref2 have the same data ID but are in 

1301 # different runs. ref3 has a different data ID. 

1302 metric = makeExampleMetrics() 

1303 dimensions = butler.registry.dimensions.extract(["instrument", "physical_filter"]) 

1304 datasetType = self.addDatasetType( 

1305 "prune_collections_test_dataset", dimensions, storageClass, butler.registry 

1306 ) 

1307 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1) 

1308 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2) 

1309 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1) 

1310 

1311 # Simple prune. 

1312 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 
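# (purge=True deletes the registry entries outright; unstore=True also
# removes the datastore artifacts, hence the LookupError below.)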

1313 with self.assertRaises(LookupError): 

1314 butler.datasetExists(ref1.datasetType, ref1.dataId, collections=run1) 

1315 

1316 # Put data back. 

1317 ref1 = butler.put(metric, ref1.unresolved(), run=run1) 

1318 ref2 = butler.put(metric, ref2.unresolved(), run=run2) 

1319 ref3 = butler.put(metric, ref3.unresolved(), run=run1) 

1320 

1321 # Check that in normal mode, deleting the record will lead to 

1322 # trash not touching the file. 

1323 uri1 = butler.datastore.getURI(ref1) 

1324 butler.datastore.bridge.moveToTrash([ref1]) # Update the dataset_location table 

1325 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref1.id}) 

1326 butler.datastore.trash(ref1) 

1327 butler.datastore.emptyTrash() 

1328 self.assertTrue(uri1.exists()) 

1329 uri1.remove() # Clean it up. 

1330 

1331 # Simulate execution butler setup by deleting the datastore 

1332 # record but keeping the file around and trusting. 

1333 butler.datastore.trustGetRequest = True 

1334 uri2 = butler.datastore.getURI(ref2) 

1335 uri3 = butler.datastore.getURI(ref3) 

1336 self.assertTrue(uri2.exists()) 

1337 self.assertTrue(uri3.exists()) 

1338 

1339 # Remove the datastore record. 

1340 butler.datastore.bridge.moveToTrash([ref2]) # Update the dataset_location table 

1341 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref2.id}) 

1342 self.assertTrue(uri2.exists()) 

1343 butler.datastore.trash([ref2, ref3]) 

1344 # Immediate removal for ref2 file 

1345 self.assertFalse(uri2.exists()) 

1346 # But ref3 has to wait for the empty. 

1347 self.assertTrue(uri3.exists()) 

1348 butler.datastore.emptyTrash() 

1349 self.assertFalse(uri3.exists()) 

1350 

1351 # Clear out the datasets from registry. 

1352 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1353 

1354 def testPytypeCoercion(self): 

1355 """Test python type coercion on Butler.get""" 

1356 

1357 # Store some data with the normal example storage class. 

1358 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1359 datasetTypeName = "test_metric" 

1360 butler = self.runPutGetTest(storageClass, datasetTypeName) 

1361 

1362 dataId = {"instrument": "DummyCamComp", "visit": 423} 

1363 metric = butler.get(datasetTypeName, dataId=dataId) 

1364 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample") 

1365 

1366 datasetType_ori = butler.registry.getDatasetType(datasetTypeName) 

1367 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents") 

1368 

1369 # Now need to hack the registry dataset type definition. 

1370 # There is no API for this. 
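# (In this low-level Database.update call, ``where`` maps the column to
# search -- "name" -- to the key in each row dict that holds the search
# value, which is why the row below uses the dataset type name itself as
# a dictionary key.)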

1371 manager = butler.registry._managers.datasets 

1372 manager._db.update( 

1373 manager._static.dataset_type, 

1374 {"name": datasetTypeName}, 

1375 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"}, 

1376 ) 

1377 

1378 # Force reset of dataset type cache 

1379 butler.registry.refresh() 

1380 

1381 datasetType_new = butler.registry.getDatasetType(datasetTypeName) 

1382 self.assertEqual(datasetType_new.name, datasetType_ori.name) 

1383 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel") 

1384 

1385 metric_model = butler.get(datasetTypeName, dataId=dataId) 

1386 self.assertNotEqual(type(metric_model), type(metric)) 

1387 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel") 

1388 

1389 # Put the model and read it back to show that everything now 

1390 # works as normal. 

1391 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424) 

1392 metric_model_new = butler.get(metric_ref) 

1393 self.assertEqual(metric_model_new, metric_model) 

1394 

1395 # Hack the storage class again to something that will fail on the 

1396 # get with no conversion class. 

1397 manager._db.update( 

1398 manager._static.dataset_type, 

1399 {"name": datasetTypeName}, 

1400 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"}, 

1401 ) 

1402 butler.registry.refresh() 

1403 

1404 with self.assertRaises(ValueError): 

1405 butler.get(datasetTypeName, dataId=dataId) 

1406 

1407 

1408class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1409 """InMemoryDatastore specialization of a butler""" 

1410 

1411 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1412 fullConfigKey = None 

1413 useTempRoot = False 

1414 validationCanFail = False 

1415 datastoreStr = ["datastore='InMemory"] 

1416 datastoreName = ["InMemoryDatastore@"] 

1417 registryStr = "/gen3.sqlite3" 

1418 

1419 def testIngest(self): 

1420 """Disabled: file ingest does not apply to an in-memory datastore.""" 

1421 

1422 

1423class ChainedDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1424 """PosixDatastore specialization""" 

1425 

1426 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

1427 fullConfigKey = ".datastore.datastores.1.formatters" 

1428 validationCanFail = True 

1429 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"] 

1430 datastoreName = [ 

1431 "InMemoryDatastore@", 

1432 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1", 

1433 "SecondDatastore", 

1434 ] 

1435 registryStr = "/gen3.sqlite3" 

1436 

1437 

1438class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase): 

1439 """Test that a yaml file in one location can refer to a root in another.""" 

1440 

1441 datastoreStr = ["dir1"] 

1442 # Disable the makeRepo test since we are deliberately not using 

1443 # butler.yaml as the config name. 

1444 fullConfigKey = None 

1445 

1446 def setUp(self): 

1447 self.root = makeTestTempDir(TESTDIR) 

1448 

1449 # Make a new repository in one place 

1450 self.dir1 = os.path.join(self.root, "dir1") 

1451 Butler.makeRepo(self.dir1, config=Config(self.configFile)) 

1452 

1453 # Move the yaml file to a different place and add a "root" 

1454 self.dir2 = os.path.join(self.root, "dir2") 

1455 os.makedirs(self.dir2, exist_ok=True) 

1456 configFile1 = os.path.join(self.dir1, "butler.yaml") 

1457 config = Config(configFile1) 

1458 config["root"] = self.dir1 

1459 configFile2 = os.path.join(self.dir2, "butler2.yaml") 

1460 config.dumpToUri(configFile2) 

1461 os.remove(configFile1) 

1462 self.tmpConfigFile = configFile2 

1463 

1464 def testFileLocations(self): 

1465 self.assertNotEqual(self.dir1, self.dir2) 

1466 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml"))) 

1467 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml"))) 

1468 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3"))) 

1469 

1470 

1471class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase): 

1472 """Test that a config file created by makeRepo outside of repo works.""" 

1473 

1474 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1475 

1476 def setUp(self): 

1477 self.root = makeTestTempDir(TESTDIR) 

1478 self.root2 = makeTestTempDir(TESTDIR) 

1479 

1480 self.tmpConfigFile = os.path.join(self.root2, "different.yaml") 

1481 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1482 

1483 def tearDown(self): 

1484 if os.path.exists(self.root2): 

1485 shutil.rmtree(self.root2, ignore_errors=True) 

1486 super().tearDown() 

1487 

1488 def testConfigExistence(self): 

1489 c = Config(self.tmpConfigFile) 

1490 uri_config = ResourcePath(c["root"]) 

1491 uri_expected = ResourcePath(self.root, forceDirectory=True) 

1492 self.assertEqual(uri_config.geturl(), uri_expected.geturl()) 

1493 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path") 

1494 

1495 def testPutGet(self): 

1496 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1497 self.runPutGetTest(storageClass, "test_metric") 

1498 

1499 

1500class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase): 

1501 """Test that a config file created by makeRepo outside of repo works.""" 

1502 

1503 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1504 

1505 def setUp(self): 

1506 self.root = makeTestTempDir(TESTDIR) 

1507 self.root2 = makeTestTempDir(TESTDIR) 

1508 

1509 self.tmpConfigFile = self.root2 

1510 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1511 

1512 def testConfigExistence(self): 

1513 # Append the yaml file name, since otherwise the Config constructor 

1514 # does not know the file type. 

1515 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml") 

1516 super().testConfigExistence() 

1517 

1518 

1519class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase): 

1520 """Test that a config file created by makeRepo outside of repo works.""" 

1521 

1522 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1523 

1524 def setUp(self): 

1525 self.root = makeTestTempDir(TESTDIR) 

1526 self.root2 = makeTestTempDir(TESTDIR) 

1527 

1528 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl() 

1529 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1530 

1531 

1532@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!") 

1533@mock_s3 

1534class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1535 """S3Datastore specialization of a butler; an S3 storage Datastore + 

1536 a local SQLite SqlRegistry. 

1537 """ 

1538 

1539 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml") 

1540 fullConfigKey = None 

1541 validationCanFail = True 

1542 

1543 bucketName = "anybucketname" 

1544 """Name of the Bucket that will be used in the tests. The name is read from 

1545 the config file used with the tests during set-up. 

1546 """ 

1547 

1548 root = "butlerRoot/" 

1549 """Root repository directory expected to be used in case useTempRoot=False. 

1550 Otherwise the root is set to a randomly generated 20-character string 

1551 during set-up. 

1552 """ 

1553 

1554 datastoreStr = [f"datastore={root}"] 

1555 """Contains all expected root locations in a format expected to be 

1556 returned by Butler stringification. 

1557 """ 

1558 

1559 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"] 

1560 """The expected format of the S3 Datastore string.""" 

1561 

1562 registryStr = "/gen3.sqlite3" 

1563 """Expected format of the Registry string.""" 

1564 

1565 def genRoot(self): 

1566 """Returns a random string of len 20 to serve as a root 

1567 name for the temporary bucket repo. 

1568 

1569 This stands in for tempfile.mkdtemp, since it provides the value 

1570 of self.root when useTempRoot is True. 

1571 """ 

1572 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20)) 

1573 return rndstr + "/" 

1574 

1575 def setUp(self): 

1576 config = Config(self.configFile) 

1577 uri = ResourcePath(config[".datastore.datastore.root"]) 

1578 self.bucketName = uri.netloc 

1579 

1580 # set up some fake credentials if they do not exist 

1581 self.usingDummyCredentials = setAwsEnvCredentials() 

1582 

1583 if self.useTempRoot: 

1584 self.root = self.genRoot() 

1585 rooturi = f"s3://{self.bucketName}/{self.root}" 

1586 config.update({"datastore": {"datastore": {"root": rooturi}}}) 

1587 

1588 # need local folder to store registry database 

1589 self.reg_dir = makeTestTempDir(TESTDIR) 

1590 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

1591 

1592 # MOTO needs to know that we expect Bucket bucketname to exist 

1593 # (this used to be the class attribute bucketName) 

1594 s3 = boto3.resource("s3") 

1595 s3.create_bucket(Bucket=self.bucketName) 
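# (Under the class-level @mock_s3 decorator, boto3 calls are intercepted
# by moto, so this bucket exists only in memory for the duration of the
# test.)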

1596 

1597 self.datastoreStr = f"datastore={self.root}" 

1598 self.datastoreName = [f"FileDatastore@{rooturi}"] 

1599 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False) 

1600 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml") 

1601 

1602 def tearDown(self): 

1603 s3 = boto3.resource("s3") 

1604 bucket = s3.Bucket(self.bucketName) 

1605 try: 

1606 bucket.objects.all().delete() 

1607 except botocore.exceptions.ClientError as e: 

1608 if e.response["Error"]["Code"] == "404": 

1609 # the key was not reachable - pass 

1610 pass 

1611 else: 

1612 raise 

1613 

1614 bucket = s3.Bucket(self.bucketName) 

1615 bucket.delete() 

1616 

1617 # unset any potentially set dummy credentials 

1618 if self.usingDummyCredentials: 

1619 unsetAwsEnvCredentials() 

1620 

1621 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

1622 shutil.rmtree(self.reg_dir, ignore_errors=True) 

1623 

1624 if self.useTempRoot and os.path.exists(self.root): 

1625 shutil.rmtree(self.root, ignore_errors=True) 

1626 

1627 

1628@unittest.skipIf(WsgiDAVApp is None, "Warning: wsgidav/cheroot not found!") 

1629# Mock required environment variables during tests 

1630@unittest.mock.patch.dict( 

1631 os.environ, 

1632 { 

1633 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1634 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"), 

1635 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs", 

1636 }, 

1637) 

1638class WebdavDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1639 """WebdavDatastore specialization of a butler; a Webdav storage Datastore + 

1640 a local SQLite SqlRegistry. 

1641 """ 

1642 

1643 configFile = os.path.join(TESTDIR, "config/basic/butler-webdavstore.yaml") 

1644 fullConfigKey = None 

1645 validationCanFail = True 

1646 

1647 serverName = "localhost" 

1648 """Name of the server that will be used in the tests. 

1649 """ 

1650 

1651 portNumber = 8080 

1652 """Port on which the webdav server listens. Automatically chosen 

1653 at setUpClass via the _getfreeport() method. 

1654 """ 

1655 

1656 root = "butlerRoot/" 

1657 """Root repository directory expected to be used in case useTempRoot=False. 

1658 Otherwise the root is set to a randomly generated 20-character string 

1659 during set-up. 

1660 """ 

1661 

1662 datastoreStr = [f"datastore={root}"] 

1663 """Contains all expected root locations in a format expected to be 

1664 returned by Butler stringification. 

1665 """ 

1666 

1667 datastoreName = ["FileDatastore@https://{serverName}/{root}"] 

1668 """The expected format of the WebdavDatastore string.""" 

1669 

1670 registryStr = "/gen3.sqlite3" 

1671 """Expected format of the Registry string.""" 

1672 

1673 serverThread = None 

1674 """Thread in which the local webdav server will run""" 

1675 

1676 stopWebdavServer = False 

1677 """This flag will cause the webdav server to 

1678 gracefully shut down when True 

1679 """ 

1680 

1681 def genRoot(self): 

1682 """Returns a random string of len 20 to serve as a root 

1683 name for the temporary bucket repo. 

1684 

1685 This stands in for tempfile.mkdtemp, since it provides the value 

1686 of self.root when useTempRoot is True. 

1687 """ 

1688 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20)) 

1689 return rndstr + "/" 

1690 

1691 @classmethod 

1692 def setUpClass(cls): 

1693 # Do the same as inherited class 

1694 cls.storageClassFactory = StorageClassFactory() 

1695 cls.storageClassFactory.addFromConfig(cls.configFile) 

1696 

1697 cls.portNumber = cls._getfreeport() 

1698 # Run a local webdav server on which tests will be run 

1699 cls.serverThread = Thread( 

1700 target=cls._serveWebdav, args=(cls, cls.portNumber, lambda: cls.stopWebdavServer), daemon=True 

1701 ) 

1702 cls.serverThread.start() 

1703 # Wait for it to start 

1704 time.sleep(3) 

1705 

1706 @classmethod 

1707 def tearDownClass(cls): 

1708 # Ask for graceful shut down of the webdav server 

1709 cls.stopWebdavServer = True 

1710 # Wait for the thread to exit 

1711 cls.serverThread.join() 

1712 

1713 # Mock required environment variables during tests 

1714 @unittest.mock.patch.dict( 

1715 os.environ, 

1716 { 

1717 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1718 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"), 

1719 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs", 

1720 }, 

1721 ) 

1722 def setUp(self): 

1723 config = Config(self.configFile) 

1724 

1725 if self.useTempRoot: 

1726 self.root = self.genRoot() 

1727 self.rooturi = f"http://{self.serverName}:{self.portNumber}/{self.root}" 

1728 config.update({"datastore": {"datastore": {"root": self.rooturi}}}) 

1729 

1730 # need local folder to store registry database 

1731 self.reg_dir = makeTestTempDir(TESTDIR) 

1732 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

1733 

1734 self.datastoreStr = f"datastore={self.root}" 

1735 self.datastoreName = [f"FileDatastore@{self.rooturi}"] 

1736 

1737 if not isWebdavEndpoint(self.rooturi): 

1738 raise OSError("Webdav server not running properly: cannot run tests.") 

1739 

1740 Butler.makeRepo(self.rooturi, config=config, forceConfigRoot=False) 

1741 self.tmpConfigFile = posixpath.join(self.rooturi, "butler.yaml") 

1742 

1743 # Mock required environment variables during tests 

1744 @unittest.mock.patch.dict( 

1745 os.environ, 

1746 { 

1747 "LSST_BUTLER_WEBDAV_AUTH": "TOKEN", 

1748 "LSST_BUTLER_WEBDAV_TOKEN_FILE": os.path.join(TESTDIR, "config/testConfigs/webdav/token"), 

1749 "LSST_BUTLER_WEBDAV_CA_BUNDLE": "/path/to/ca/certs", 

1750 }, 

1751 ) 

1752 def tearDown(self): 

1753 # Clear temporary directory 

1754 ResourcePath(self.rooturi).remove() 

1755 ResourcePath(self.rooturi).session.close() 

1756 

1757 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

1758 shutil.rmtree(self.reg_dir, ignore_errors=True) 

1759 

1760 if self.useTempRoot and os.path.exists(self.root): 

1761 shutil.rmtree(self.root, ignore_errors=True) 

1762 

1763 def _serveWebdav(self, port: int, stopWebdavServer): 

1764 """Starts a local webdav-compatible HTTP server, 

1765 Listening on http://localhost:port 

1766 This server only runs when this test class is instantiated, 

1767 and then shuts down. Must be started is a separate thread. 

1768 

1769 Parameters 

1770 ---------- 

1771 port : `int` 

1772 The port number on which the server should listen. 

1773 """ 

1774 root_path = gettempdir() 

1775 

1776 config = { 

1777 "host": "0.0.0.0", 

1778 "port": port, 

1779 "provider_mapping": {"/": root_path}, 

1780 "http_authenticator": {"domain_controller": None}, 

1781 "simple_dc": {"user_mapping": {"*": True}}, 

1782 "verbose": 0, 

1783 } 

1784 app = WsgiDAVApp(config) 

1785 

1786 server_args = { 

1787 "bind_addr": (config["host"], config["port"]), 

1788 "wsgi_app": app, 

1789 } 

1790 server = wsgi.Server(**server_args) 

1791 server.prepare() 

1792 

1793 try: 

1794 # Start the actual server in a separate thread 

1795 t = Thread(target=server.serve, daemon=True) 

1796 t.start() 

1797 # watch stopWebdavServer, and gracefully 

1798 # shut down the server when True 

1799 while True: 

1800 if stopWebdavServer(): 

1801 break 

1802 time.sleep(1) 

1803 except KeyboardInterrupt: 

1804 print("Caught Ctrl-C, shutting down...") 

1805 finally: 

1806 server.stop() 

1807 t.join() 

1808 

@staticmethod 

1809 def _getfreeport(): 

1810 """ 

1811 Determine a free port using sockets. 

1812 """ 

1813 free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 

1814 free_socket.bind(("0.0.0.0", 0)) 

1815 free_socket.listen() 

1816 port = free_socket.getsockname()[1] 

1817 free_socket.close() 

1818 return port 
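# Note that probing with a throwaway socket is inherently racy: the port
# is free when checked but could be claimed before the webdav server
# binds. That window is acceptable for a local test fixture.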

1819 

1820 

1821class PosixDatastoreTransfers(unittest.TestCase): 

1822 """Test data transfers between butlers. 

1823 

1824 Different dataset ID managers are covered: UUID to UUID and integer 

1825 to integer are tested. UUID to integer is not supported since we do 

1826 not currently want to allow that. Integer to UUID is supported, with 

1827 the caveat that UUID4 values will be generated, which would be 

1828 incorrect for raw dataset types; the test ignores that. 

1829 """ 

1830 

1831 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1832 

1833 @classmethod 

1834 def setUpClass(cls): 

1835 cls.storageClassFactory = StorageClassFactory() 

1836 cls.storageClassFactory.addFromConfig(cls.configFile) 

1837 

1838 def setUp(self): 

1839 self.root = makeTestTempDir(TESTDIR) 

1840 self.config = Config(self.configFile) 

1841 

1842 def tearDown(self): 

1843 removeTestTempDir(self.root) 

1844 

1845 def create_butler(self, manager, label): 

1846 config = Config(self.configFile) 

1847 config["registry", "managers", "datasets"] = manager 

1848 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True) 

1849 

1850 def create_butlers(self, manager1, manager2): 

1851 self.source_butler = self.create_butler(manager1, "1") 

1852 self.target_butler = self.create_butler(manager2, "2") 

1853 

1854 def testTransferUuidToUuid(self): 

1855 self.create_butlers( 

1856 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1857 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1858 ) 

1859 # Setting id_gen_map should have no effect here 

1860 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE}) 

1861 

1862 def testTransferIntToInt(self): 

1863 self.create_butlers( 

1864 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager", 

1865 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager", 

1866 ) 

1867 # int dataset ID only allows UNIQUE 

1868 self.assertButlerTransfers() 

1869 

1870 def testTransferIntToUuid(self): 

1871 self.create_butlers( 

1872 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManager", 

1873 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1874 ) 

1875 self.assertButlerTransfers(id_gen_map={"random_data_2": DatasetIdGenEnum.DATAID_TYPE}) 

1876 

1877 def testTransferMissing(self): 

1878 """Test transfers where datastore records are missing. 

1879 

1880 This is how execution butler works. 

1881 """ 

1882 self.create_butlers( 

1883 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1884 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1885 ) 

1886 

1887 # Configure the source butler to allow trust. 

1888 self.source_butler.datastore.trustGetRequest = True 

1889 

1890 self.assertButlerTransfers(purge=True) 

1891 

1892 def testTransferMissingDisassembly(self): 

1893 """Test transfers where datastore records are missing. 

1894 

1895 This is how execution butler works. 

1896 """ 

1897 self.create_butlers( 

1898 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1899 "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID", 

1900 ) 

1901 

1902 # Configure the source butler to allow trust. 

1903 self.source_butler.datastore.trustGetRequest = True 

1904 

1905 # Test disassembly. 

1906 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite") 

1907 

1908 def assertButlerTransfers(self, id_gen_map=None, purge=False, storageClassName="StructuredData"): 

1909 """Test that a run can be transferred to another butler.""" 

1910 

1911 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

1912 datasetTypeName = "random_data" 

1913 

1914 # Test will create 3 collections and we will want to transfer 

1915 # two of those three. 

1916 runs = ["run1", "run2", "other"] 

1917 

1918 # Also want to use two different dataset types to ensure that 

1919 # grouping works. 

1920 datasetTypeNames = ["random_data", "random_data_2"] 

1921 

1922 # Create the run collections in the source butler. 

1923 for run in runs: 

1924 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

1925 

1926 # Create dimensions in both butlers (transfer will not create them). 

1927 n_exposures = 30 

1928 for butler in (self.source_butler, self.target_butler): 

1929 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

1930 butler.registry.insertDimensionData( 

1931 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

1932 ) 

1933 butler.registry.insertDimensionData( 

1934 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"} 

1935 ) 

1936 

1937 for i in range(n_exposures): 

1938 butler.registry.insertDimensionData( 

1939 "exposure", 

1940 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"}, 

1941 ) 

1942 

1943 # Create dataset types in the source butler. 

1944 dimensions = butler.registry.dimensions.extract(["instrument", "exposure"]) 

1945 for datasetTypeName in datasetTypeNames: 

1946 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

1947 self.source_butler.registry.registerDatasetType(datasetType) 

1948 

1949 # Write a dataset to an unrelated run -- this will ensure that 

1950 # we are rewriting integer dataset ids in the target if necessary. 

1951 # Will not be relevant for UUID. 

1952 run = "distraction" 

1953 butler = Butler(butler=self.source_butler, run=run) 

1954 butler.put( 

1955 makeExampleMetrics(), 

1956 datasetTypeName, 

1957 exposure=1, 

1958 instrument="DummyCamComp", 

1959 physical_filter="d-r", 

1960 ) 

1961 

1962 # Write some example metrics to the source 

1963 butler = Butler(butler=self.source_butler) 

1964 

1965 # Set of DatasetRefs that should be in the list of refs to transfer 

1966 # but which will not be transferred. 

1967 deleted = set() 

1968 

1969 n_expected = 20 # Number of datasets expected to be transferred 

1970 source_refs = [] 

1971 for i in range(n_exposures): 

1972 # Put a third of the datasets into each collection; only 

1973 # two thirds (run1 and run2) are retained for transfer. 

1974 index = i % 3 

1975 run = runs[index] 

1976 datasetTypeName = datasetTypeNames[i % 2] 

1977 

1978 metric_data = { 

1979 "summary": {"counter": i}, 

1980 "output": {"text": "metric"}, 

1981 "data": [2 * x for x in range(i)], 

1982 } 

1983 metric = MetricsExample(**metric_data) 

1984 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"} 

1985 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run) 

1986 

1987 # Remove the datastore record using low-level API 

1988 if purge: 

1989 # Remove records for a fraction. 

1990 if index == 1: 

1991 

1992 # For one of these delete the file as well. 

1993 # This allows the "missing" code to filter the 

1994 # file out. 

1995 if not deleted: 

1996 primary, uris = butler.datastore.getURIs(ref) 

1997 if primary: 

1998 primary.remove() 

1999 for uri in uris.values(): 

2000 uri.remove() 

2001 n_expected -= 1 

2002 deleted.add(ref) 

2003 

2004 # Remove the datastore record. 

2005 butler.datastore._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

2006 

2007 if index < 2: 

2008 source_refs.append(ref) 

2009 if ref not in deleted: 

2010 new_metric = butler.get(ref.unresolved(), collections=run) 

2011 self.assertEqual(new_metric, metric) 

2012 

2013 # Create some bad dataset types to ensure we check for inconsistent 

2014 # definitions. 

2015 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList") 

2016 for datasetTypeName in datasetTypeNames: 

2017 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass) 

2018 self.target_butler.registry.registerDatasetType(datasetType) 

2019 with self.assertRaises(ConflictingDefinitionError): 

2020 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map) 

2021 # And remove the bad definitions. 

2022 for datasetTypeName in datasetTypeNames: 

2023 self.target_butler.registry.removeDatasetType(datasetTypeName) 

2024 

2025 # Transfer without creating dataset types should fail. 

2026 with self.assertRaises(KeyError): 

2027 self.target_butler.transfer_from(self.source_butler, source_refs, id_gen_map=id_gen_map) 

2028 

2029 # Now transfer them to the second butler 

2030 with self.assertLogs(level=logging.DEBUG) as cm: 

2031 transferred = self.target_butler.transfer_from( 

2032 self.source_butler, source_refs, id_gen_map=id_gen_map, register_dataset_types=True 

2033 ) 

2034 self.assertEqual(len(transferred), n_expected) 

2035 log_output = ";".join(cm.output) 

2036 self.assertIn("found in datastore for chunk", log_output) 

2037 self.assertIn("Creating output run", log_output) 

2038 

2039 # Do the transfer twice to ensure that it will do nothing extra. 

2040 # Only do this if purge=True because it does not work for int 

2041 # dataset_id. 

2042 if purge: 

2043 # This should not need to register dataset types. 

2044 transferred = self.target_butler.transfer_from( 

2045 self.source_butler, source_refs, id_gen_map=id_gen_map 

2046 ) 

2047 self.assertEqual(len(transferred), n_expected) 

2048 

2049 # Also do an explicit low-level transfer to trigger some 

2050 # edge cases. 

2051 with self.assertLogs(level=logging.DEBUG) as cm: 

2052 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs) 

2053 log_output = ";".join(cm.output) 

2054 self.assertIn("no file artifacts exist", log_output) 

2055 

2056 with self.assertRaises(TypeError): 

2057 self.target_butler.datastore.transfer_from(self.source_butler, source_refs) 

2058 

2059 with self.assertRaises(ValueError): 

2060 self.target_butler.datastore.transfer_from( 

2061 self.source_butler.datastore, source_refs, transfer="split" 

2062 ) 

2063 

2064 # Now try to get the same refs from the new butler. 

2065 for ref in source_refs: 

2066 if ref not in deleted: 

2067 unresolved_ref = ref.unresolved() 

2068 new_metric = self.target_butler.get(unresolved_ref, collections=ref.run) 

2069 old_metric = self.source_butler.get(unresolved_ref, collections=ref.run) 

2070 self.assertEqual(new_metric, old_metric) 

2071 

2072 # Now prune the run2 collection and create a CHAINED collection 

2073 # in its place. This should block the transfer. 

2074 self.target_butler.pruneCollection("run2", purge=True, unstore=True) 

2075 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED) 

2076 with self.assertRaises(TypeError): 

2077 # Re-importing the run1 datasets can be problematic if they 

2078 # use integer IDs, so restrict the transfer to the run2 refs. 

2079 to_transfer = [ref for ref in source_refs if ref.run == "run2"] 

2080 self.target_butler.transfer_from(self.source_butler, to_transfer, id_gen_map=id_gen_map) 

2081 

2082 

2083if __name__ == "__main__": 

2084 unittest.main()