# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler."""

from __future__ import annotations

import gc
import json
import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import string
import tempfile
import unittest
import unittest.mock  # Needed for unittest.mock.patch.dict used below.
import uuid
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, cast

try:
    import boto3
    import botocore
    from lsst.resources.s3utils import clean_test_environment_for_s3

    try:
        from moto import mock_aws  # v5
    except ImportError:
        from moto import mock_s3 as mock_aws
except ImportError:
    boto3 = None

    def mock_aws(*args: Any, **kwargs: Any) -> Any:  # type: ignore[no-untyped-def]
        """No-op decorator in case moto mock_aws can not be imported."""
        return None
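# moto v5 renamed ``mock_s3`` to ``mock_aws``; the nested import above accepts
# either name so these tests run against both major versions. The no-op
# stand-in defined when boto3 is missing only keeps this module importable;
# the S3-backed test cases are expected to be skipped in that situation.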

try:
    # It's possible but silly to have testing.postgresql installed without
    # having the postgresql server installed (because then nothing in
    # testing.postgresql would work), so we use the presence of that module
    # to test whether we can expect the server to be available.
    import testing.postgresql  # type: ignore[import]
except ImportError:
    testing = None

import astropy.time
import sqlalchemy
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    ButlerRepoIndex,
    CollectionType,
    Config,
    DataCoordinate,
    DatasetExistence,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.datastore import NullDatastore
from lsst.daf.butler.datastore.file_templates import FileTemplate, FileTemplateValidationError
from lsst.daf.butler.datastores.fileDatastore import FileDatastore
from lsst.daf.butler.direct_butler import DirectButler
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
    OrphanedRecordError,
)
from lsst.daf.butler.registry.sql_registry import SqlRegistry
from lsst.daf.butler.repo_relocation import BUTLER_ROOT_TAG
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.introspection import get_full_type_name

if TYPE_CHECKING:
    import types

    from lsst.daf.butler import DimensionGroup, Registry, StorageClass

TESTDIR = os.path.abspath(os.path.dirname(__file__))

def clean_environment() -> None:
    """Remove external environment variables that affect the tests."""
    for k in ("DAF_BUTLER_REPOSITORY_INDEX",):
        os.environ.pop(k, None)


def makeExampleMetrics() -> MetricsExample:
    """Return an example dataset suitable for tests."""
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )

class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent the misdiagnosis
    that might otherwise occur when a standard exception is used.
    """

    pass

class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not covered by any other test
    cases.
    """

    def testSearchPath(self) -> None:
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")

class ButlerPutGetTests(TestCaseMixin):
    """Helper methods for running a suite of put/get tests from different
    butler configurations.
    """

    root: str
    default_run = "ingésτ😺"
    storageClassFactory: StorageClassFactory
    configFile: str
    tmpConfigFile: str

    @staticmethod
    def addDatasetType(
        datasetTypeName: str, dimensions: DimensionGroup, storageClass: StorageClass | str, registry: Registry
    ) -> DatasetType:
        """Create a DatasetType and register it"""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls) -> None:
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

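    # Components of a composite dataset (e.g. ``metric.summary``) can be read
    # individually via ``<datasetType>.<component>`` names; the helper below
    # checks both direct and deferred component gets against a reference
    # object.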
    def assertGetComponents(
        self,
        butler: Butler,
        datasetRef: DatasetRef,
        components: tuple[str, ...],
        reference: Any,
        collections: Any = None,
    ) -> None:
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self) -> None:
        removeTestTempDir(self.root)

    def create_butler(
        self, run: str, storageClass: StorageClass | str, datasetTypeName: str
    ) -> tuple[DirectButler, DatasetType]:
        butler = Butler.from_config(self.tmpConfigFile, run=run)
        assert isinstance(butler, DirectButler), "Expect DirectButler in configuration"

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {run})

        # Create and register a DatasetType
        dimensions = butler.dimensions.conform(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                },
            )
        return butler, datasetType

    def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> DirectButler:
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = self.default_run
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)
        assert butler.run is not None

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = butler.registry.expandDataId({"instrument": "DummyCamComp", "visit": 423})

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

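        # Exercise the three equivalent put() signatures in turn: a resolved
        # DatasetRef, a dataset type name plus dataId, and a DatasetType
        # instance plus dataId.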
        counter = 0
        ref = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run="put_run_1")
        args: tuple[DatasetRef] | tuple[str | DatasetType, DataCoordinate]
        for args in ((ref,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                kwargs: dict[str, Any] = {}
                if not isinstance(args[0], DatasetRef):  # type: ignore
                    kwargs["run"] = this_run
                ref = butler.put(metric, *args, **kwargs)
                self.assertIsInstance(ref, DatasetRef)

                # Test get of a ref.
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get by dataset type name and dataId.
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a ref
                metricOut = butler.getDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

                primary_uri, secondary_uris = butler.getURIs(ref)
                n_uris = len(secondary_uris)
                if primary_uri:
                    n_uris += 1

                # Can the artifacts themselves be retrieved?
                if not butler._datastore.isEphemeral:
                    # Create a temporary directory to hold the retrieved
                    # artifacts.
                    with tempfile.TemporaryDirectory(
                        prefix="butler-artifacts-", ignore_cleanup_errors=True
                    ) as artifact_root:
                        root_uri = ResourcePath(artifact_root, forceDirectory=True)

                        for preserve_path in (True, False):
                            destination = root_uri.join(f"{preserve_path}_{counter}/")
                            log = logging.getLogger("lsst.x")
                            log.warning("Using destination %s for args %s", destination, args)
                            # Use copy so that we can test that overwrite
                            # protection works (using "auto" for File URIs
                            # would use hard links and subsequent transfer
                            # would work because it knows they are the same
                            # file).
                            transferred = butler.retrieveArtifacts(
                                [ref], destination, preserve_path=preserve_path, transfer="copy"
                            )
                            self.assertGreater(len(transferred), 0)
                            artifacts = list(ResourcePath.findFileResources([destination]))
                            self.assertEqual(set(transferred), set(artifacts))

                            for artifact in transferred:
                                path_in_destination = artifact.relative_to(destination)
                                self.assertIsNotNone(path_in_destination)
                                assert path_in_destination is not None

                                # When path is not preserved there should not
                                # be any path separators.
                                num_seps = path_in_destination.count("/")
                                if preserve_path:
                                    self.assertGreater(num_seps, 0)
                                else:
                                    self.assertEqual(num_seps, 0)

                            self.assertEqual(
                                len(artifacts),
                                n_uris,
                                "Comparing expected artifacts vs actual:"
                                f" {artifacts} vs {primary_uri} and {secondary_uris}",
                            )

                            if preserve_path:
                                # No need to run these twice
                                with self.assertRaises(ValueError):
                                    butler.retrieveArtifacts([ref], destination, transfer="move")

                                with self.assertRaises(FileExistsError):
                                    butler.retrieveArtifacts([ref], destination)

                                transferred_again = butler.retrieveArtifacts(
                                    [ref], destination, preserve_path=preserve_path, overwrite=True
                                )
                                self.assertEqual(set(transferred_again), set(transferred))

                # Now remove the dataset completely.
                butler.pruneDatasets([ref], purge=True, unstore=True)
                # Lookup with original args should still fail.
                kwargs = {"collections": this_run}
                if isinstance(args[0], DatasetRef):
                    kwargs = {}  # Prevent warning from being issued.
                self.assertFalse(butler.exists(*args, **kwargs))
                # get() should still fail.
                with self.assertRaises(FileNotFoundError):
                    butler.get(ref)
                # Registry shouldn't be able to find it by dataset_id anymore.
                self.assertIsNone(butler.get_dataset(ref.id))

                # Do explicit registry removal since we know they are
                # empty
                butler.registry.removeCollection(this_run)
                expected_collections.remove(this_run)

        # Create DatasetRef for put using default run.
        refIn = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run=butler.run)

        # Check that getDeferred fails with standalone ref.
        with self.assertRaises(LookupError):
            butler.getDeferred(refIn)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

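        # ``parameters`` are applied on read by the storage class delegate;
        # here the ``slice`` parameter trims the ``data`` list while leaving
        # ``summary`` and ``output`` untouched.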
        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        assert metric.data is not None  # for mypy
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.find_dataset(compNameS, dataId, collections=butler.collections)
            assert compRef is not None
            summary = butler.get(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaisesRegex(ValueError, "Supplied dataset type .* inconsistent with registry"):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaisesRegex(ValueError, "DatasetRef given, cannot use dataId as well"):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match.
        with self.assertRaises(FileNotFoundError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=uuid.UUID(int=101), run=butler.run))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaisesRegex(KeyError, "Parameter 'unsupported' not understood"):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Add the same ref again, so we can check that duplicate put fails.
        ref = butler.put(metric, datasetType, dataId)

        # Repeat put will fail.
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Repeat the same sequence with resolved ref.
        butler.pruneDatasets([ref], unstore=True, purge=True)
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaisesRegex(ConflictingDefinitionError, "Datastore already contains dataset"):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # In case of resolved ref this write will succeed.
        ref = butler.put(metric, refIn)

        # Leave the dataset in place since some downstream tests require
        # something to be present

        return butler

    def testDeferredCollectionPassing(self) -> None:
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler.from_config(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.dimensions.conform(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.exists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to use the dataset without any collection raises
        # CollectionError.
        self.assertFalse(butler.exists(datasetType, dataId))
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Deleting the dataset from the new collection should leave it
        # findable in the original collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.exists(datasetType, dataId, collections=[run]))

class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""
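    # Subclasses configure these attributes to describe the expected repo
    # flavor: ``fullConfigKey`` is a key present only in expanded configs
    # (None when the datastore has no file system root), ``datastoreStr`` and
    # ``registryStr`` are substrings expected in str(butler), and
    # ``datastoreName`` lists the expected datastore names.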

    useTempRoot = True
    validationCanFail: bool
    fullConfigKey: str | None
    registryStr: str | None
    datastoreName: list[str] | None
    datastoreStr: list[str]

    def setUp(self) -> None:
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self) -> None:
        """Independent test of constructor."""
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler.from_config(config_dir, run=self.default_run)
            self.assertIsInstance(butler, Butler)

            # Even with a ResourcePath.
            butler = Butler.from_config(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {self.default_run})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler.from_config(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler.from_config(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, ("other",))
        self.assertIsNone(butler2.run)
        self.assertEqual(type(butler._datastore), type(butler2._datastore))
        self.assertEqual(butler._datastore.config, butler2._datastore.config)

        # Test that we can use an environment variable to find this
        # repository.
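        # The index file maps repository labels to butler config URIs, e.g.
        #   label: /path/to/repo/butler.yaml
        #   bad_label: file://bucket/not_real.yaml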

        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"file://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), {"label", "bad_label"})
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler.from_config(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler.from_config("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler.from_config("not_there", writeable=False)
                    with self.assertRaisesRegex(FileNotFoundError, "resolved from alias 'bad_label'"):
                        Butler.from_config("bad_label")
                    with self.assertRaises(FileNotFoundError):
                        # Should ignore aliases.
                        Butler.from_config(ResourcePath("label", forceAbsolute=False))
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertEqual(
                        Butler.get_repo_uri("missing", True), ResourcePath("missing", forceAbsolute=False)
                    )
                    self.assertIn("not known to", str(cm.exception))
                    # Should report no failure.
                    self.assertEqual(ButlerRepoIndex.get_failure_reason(), "")
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                # Now with empty configuration.
                butler_index = Config()
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    with self.assertRaisesRegex(FileNotFoundError, "(no known aliases)"):
                        Butler.from_config("label")
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                # Now with bad contents.
                with open(temp_file.ospath, "w") as fh:
                    print("'", file=fh)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    with self.assertRaisesRegex(FileNotFoundError, "(no known aliases:.*could not be read)"):
                        Butler.from_config("label")
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())

            with self.assertRaisesRegex(FileNotFoundError, "index file not found"):
                Butler.from_config("label")

            # Check that we can create a Butler when the alias file is not
            # found.
            butler = Butler.from_config(self.tmpConfigFile, writeable=False)
            self.assertIsInstance(butler, Butler)

        with self.assertRaises(RuntimeError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertEqual(Butler.get_repo_uri("label", True), ResourcePath("label", forceAbsolute=False))
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases.*No repository index"):
            # No aliases registered.
            Butler.from_config("not_there")
        self.assertEqual(Butler.get_known_repos(), set())

    def testDafButlerRepositories(self) -> None:
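        # DAF_BUTLER_REPOSITORIES holds the label-to-URI mapping inline as
        # YAML rather than pointing at an external index file; it is mutually
        # exclusive with DAF_BUTLER_REPOSITORY_INDEX, as checked below.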

        with unittest.mock.patch.dict(
            os.environ,
            {"DAF_BUTLER_REPOSITORIES": "label: 'https://someuri.com'\notherLabel: 'https://otheruri.com'\n"},
        ):
            self.assertEqual(str(Butler.get_repo_uri("label")), "https://someuri.com")

        with unittest.mock.patch.dict(
            os.environ,
            {
                "DAF_BUTLER_REPOSITORIES": "label: https://someuri.com",
                "DAF_BUTLER_REPOSITORY_INDEX": "https://someuri.com",
            },
        ):
            with self.assertRaisesRegex(RuntimeError, "Only one of the environment variables"):
                Butler.get_repo_uri("label")

        with unittest.mock.patch.dict(
            os.environ,
            {"DAF_BUTLER_REPOSITORIES": "invalid"},
        ):
            with self.assertRaisesRegex(ValueError, "Repository index not in expected format"):
                Butler.get_repo_uri("label")

    def testBasicPutGet(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler._datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler._datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testStorageClassOverrideGet(self) -> None:
        """Test storage class conversion on get with override."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        datasetTypeName = "anything"
        run = self.default_run

        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset.
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        ref = butler.put(metric, datasetType, dataId)

        # Return native type.
        retrieved = butler.get(ref)
        self.assertEqual(retrieved, metric)

        # Specify an override.
        new_sc = self.storageClassFactory.getStorageClass("MetricsConversion")
        model = butler.get(ref, storageClass=new_sc)
        self.assertNotEqual(type(model), type(retrieved))
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override later.
        deferred = butler.getDeferred(ref)
        model = deferred.get(storageClass=new_sc)
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override up front.
        deferred = butler.getDeferred(ref, storageClass=new_sc)
        model = deferred.get()
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Retrieve a component. Should be a tuple.
        data = butler.get("anything.data", dataId, storageClass="StructuredDataDataTestTuple")
        self.assertIs(type(data), tuple)
        self.assertEqual(data, tuple(retrieved.data))

        # Parameter on the write storage class should work regardless
        # of read storage class.
        data = butler.get(
            "anything.data",
            dataId,
            storageClass="StructuredDataDataTestTuple",
            parameters={"slice": slice(2, 4)},
        )
        self.assertEqual(len(data), 2)

        # Try a parameter that is known to the read storage class but not
        # the write storage class.
        with self.assertRaises(KeyError):
            butler.get(
                "anything.data",
                dataId,
                storageClass="StructuredDataDataTestTuple",
                parameters={"xslice": slice(2, 4)},
            )

    def testPytypePutCoercion(self) -> None:
        """Test python type coercion on Butler.get and put."""
        # Store some data with the normal example storage class.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        datasetTypeName = "test_metric"
        butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)

        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Put a dict and this should coerce to a MetricsExample
        test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
        metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
        test_metric = butler.get(metric_ref)
        self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
        self.assertEqual(test_metric.summary, test_dict["summary"])
        self.assertEqual(test_metric.output, test_dict["output"])

        # Check that the put still works if a DatasetType is given with
        # a definition matching this python type.
        registry_type = butler.get_dataset_type(datasetTypeName)
        this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
        metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
        self.assertEqual(metric2_ref.datasetType, registry_type)

        # The get will return the type expected by registry.
        test_metric2 = butler.get(metric2_ref)
        self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")

        # Make a new DatasetRef with the compatible but different DatasetType.
        # This should now return a dict.
        new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
        test_dict2 = butler.get(new_ref)
        self.assertEqual(get_full_type_name(test_dict2), "dict")

        # Get it again with the compatible but different dataset type
        # definition, this time looked up by dataset type and dataId rather
        # than by a resolved ref. This should be consistent with the
        # get(new_ref) behavior above and return the python type of the
        # given DatasetType.
        test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
        self.assertEqual(get_full_type_name(test_dict3), "dict")

    def testIngest(self) -> None:
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)

        # Create and register a DatasetType
        dimensions = butler.dimensions.conform(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImportType("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, run=self.default_run)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, run=self.default_run))

        # Test "move" transfer to ensure that the files themselves
        # have disappeared following ingest.
        with ResourcePath.temporary_uri(suffix=".yaml") as tempFile:
            tempFile.transfer_from(ResourcePath(metricFile), transfer="copy")

            datasets = []
            datasets.append(FileDataset(path=tempFile, refs=refs, formatter=MultiDetectorFormatter))

            # For first ingest use copy.
            butler.ingest(*datasets, transfer="copy", record_validation_info=False)

            # Now try to ingest again in "execution butler" mode where
            # the registry entries exist but the datastore does not have
            # the files. We also need to strip the dimension records to ensure
            # that they will be re-added by the ingest.
            ref = datasets[0].refs[0]
            datasets[0].refs = [
                cast(
                    DatasetRef,
                    butler.find_dataset(ref.datasetType, data_id=ref.dataId, collections=ref.run),
                )
                for ref in datasets[0].refs
            ]
            all_refs = []
            for dataset in datasets:
                refs = []
                for ref in dataset.refs:
                    # Create a dict from the dataId to drop the records.
                    new_data_id = dict(ref.dataId.required)
                    new_ref = butler.find_dataset(ref.datasetType, new_data_id, collections=ref.run)
                    assert new_ref is not None
                    self.assertFalse(new_ref.dataId.hasRecords())
                    refs.append(new_ref)
                dataset.refs = refs
                all_refs.extend(dataset.refs)
            butler.pruneDatasets(all_refs, disassociate=False, unstore=True, purge=False)

            # Use move mode to test that the file is deleted. Also
            # disable recording of file size.
            butler.ingest(*datasets, transfer="move", record_validation_info=False)

            # Check that every ref now has records.
            for dataset in datasets:
                for ref in dataset.refs:
                    self.assertTrue(ref.dataId.hasRecords())

            # Ensure that the file has disappeared.
            self.assertFalse(tempFile.exists())

            # Check that the datastore recorded no file size.
            # Not all datastores can support this.
            try:
                infos = butler._datastore.getStoredItemsInfo(datasets[0].refs[0])  # type: ignore[attr-defined]
                self.assertEqual(infos[0].file_size, -1)
            except AttributeError:
                pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.exists(datasetTypeName, dataId1))
        self.assertTrue(butler.exists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

        # Ensure we can ingest 0 datasets
        datasets = []
        butler.ingest(*datasets)

    def testPickle(self) -> None:
        """Test pickle support."""
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        assert isinstance(butler, DirectButler), "Expect DirectButler in configuration"
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self) -> None:
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        dimensions = butler.dimensions.conform(["instrument", "visit", "physical_filter"])
        dimensionEntries: list[tuple[str, list[Mapping[str, Any]]]] = [
            (
                "instrument",
                [
                    {"instrument": "DummyCam"},
                    {"instrument": "DummyHSC"},
                    {"instrument": "DummyCamComp"},
                ],
            ),
            ("physical_filter", [{"instrument": "DummyCam", "name": "d-r", "band": "R"}]),
            ("visit", [{"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}]),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for element, data in dimensionEntries:
            butler.registry.insertDimensionData(element, *data)

        # When a DatasetType is added to the registry entries are not created
        # for components but querying them can return the components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry: set[DatasetType] = set()
        for parent_dataset_type in butler.registry.queryDatasetTypes():
            fromRegistry.add(parent_dataset_type)
            fromRegistry.update(parent_dataset_type.makeAllComponentDatasetTypes())
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self) -> None:
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        datasetTypeName = "test_metric"
        dimensions = butler.dimensions.conform(["instrument", "visit"])
        dimensionEntries: tuple[tuple[str, Mapping[str, Any]], ...] = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
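        # Everything inside the butler.transaction() block below -- the
        # dimension inserts and the put -- must be rolled back together when
        # TransactionTestError propagates out.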

        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test get of a ref.
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.find_dataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.get(ref)

    def testMakeRepo(self) -> None:
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)
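        # makeRepo with standalone=True writes a fully-expanded configuration
        # into the new repo rather than one that picks up defaults at runtime;
        # the two butlers constructed below should nevertheless end up with
        # identical configurations.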

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler.from_config(butlerConfig)
        assert isinstance(butler1, DirectButler), "Expect DirectButler in configuration"
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler.from_config(butlerConfig)
        assert isinstance(butler2, DirectButler), "Expect DirectButler in configuration"
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler.from_config(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self) -> None:
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler._datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self) -> None:
        """Test that dataIds can be rewritten based on dimension records."""
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"
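        # The dataset type uses the exposure dimension, but the dataIds used
        # for the puts below identify the exposure indirectly through seq_num
        # and day_obs; the butler has to rewrite them into the required
        # exposure key using the dimension records inserted here.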

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.dimensions.conform(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)

    def testGetDatasetCollectionCaching(self) -> None:
        # Prior to DM-41117, there was a bug where get_dataset would throw
        # MissingCollectionError if you tried to fetch a dataset that was
        # added after the collection cache was last updated.
        reader_butler, datasetType = self.create_butler(self.default_run, "int", "datasettypename")
        writer_butler = Butler.from_config(self.tmpConfigFile, writeable=True, run="new_run")
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        put_ref = writer_butler.put(123, datasetType, dataId)
        get_ref = reader_butler.get_dataset(put_ref.id)
        self.assertEqual(get_ref.id, put_ref.id)

class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root: str | ResourcePath, relpath: str | ResourcePath) -> bool:
        """Check if a file exists at a given path (relative to root).

        Test testPutTemplates verifies the actual physical existence of the
        files in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"}
        )
        butler.registry.insertDimensionData(
            "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"}
        )

        # Create and store a dataset
        metric = makeExampleMetrics()

        # Create two almost-identical DatasetTypes (both will use default
        # template)
        dimensions = butler.dimensions.conform(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(uri.exists())
        self.assertTrue(
            uri.unquoted_path.endswith(f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle")
        )

        # Check the template based on dimensions
        if hasattr(butler._datastore, "templates"):
            butler._datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(uri.exists())
        self.assertTrue(
            uri.unquoted_path.endswith(f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle")
        )

        # Check the template based on dimensions
        if hasattr(butler._datastore, "templates"):
            butler._datastore.templates.validateTemplates([ref])

        # Use a template that has a typo in dimension record metadata.
        # Easier to test with a butler that has a ref with records attached.

1349 template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits") 

1350 with self.assertLogs("lsst.daf.butler.datastore.file_templates", "INFO"): 

1351 path = template.format(ref) 

1352 self.assertEqual(path, f"a/v423/{ref.id}_fits") 

1353 

1354 template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits") 

1355 with self.assertRaises(KeyError): 

1356 with self.assertLogs("lsst.daf.butler.datastore.file_templates", "INFO"): 

1357 template.format(ref) 

1358 

1359 # Now use a file template that will not result in unique filenames 

1360 with self.assertRaises(FileTemplateValidationError): 

1361 butler.put(metric, "metric3", dataId1) 

1362 

1363 def testImportExport(self) -> None: 

1364 # Run put/get tests just to create and populate a repo. 

1365 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1366 self.runImportExportTest(storageClass) 

1367 

1368 @unittest.expectedFailure 

1369 def testImportExportVirtualComposite(self) -> None: 

1370 # Run put/get tests just to create and populate a repo. 

1371 storageClass = self.storageClassFactory.getStorageClass("StructuredComposite") 

1372 self.runImportExportTest(storageClass) 

1373 

1374 def runImportExportTest(self, storageClass: StorageClass) -> None: 

1375 """Test exporting and importing. 

1376 

1377 This test does an export to a temp directory and an import back 

1378 into a new temp directory repo. It does not assume a posix datastore. 

1379 """ 

1380 exportButler = self.runPutGetTest(storageClass, "test_metric") 

1381 

1382 # Test that we must have a file extension. 

1383 with self.assertRaises(ValueError): 

1384 with exportButler.export(filename="dump", directory=".") as export: 

1385 pass 

1386 

1387 # Test that unknown format is not allowed. 

1388 with self.assertRaises(ValueError): 

1389 with exportButler.export(filename="dump.fits", directory=".") as export: 

1390 pass 

1391 

1392 # Test that the repo actually has at least one dataset. 

1393 datasets = list(exportButler.registry.queryDatasets(..., collections=...)) 

1394 self.assertGreater(len(datasets), 0) 

1395 # Add a DimensionRecord that's unused by those datasets. 

1396 skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")} 

1397 exportButler.registry.insertDimensionData("skymap", skymapRecord) 

1398 # Export and then import datasets. 

1399 with safeTestTempDir(TESTDIR) as exportDir: 

1400 exportFile = os.path.join(exportDir, "exports.yaml") 

1401 with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export: 

1402 export.saveDatasets(datasets) 

1403 # Export the same datasets again. This should quietly do 

1404 # nothing because of internal deduplication, and it shouldn't 

1405 # complain about being asked to export the "htm7" elements even 

1406 # though there aren't any in these datasets or in the database. 

1407 export.saveDatasets(datasets, elements=["htm7"]) 

1408 # Save one of the data IDs again; this should be harmless 

1409 # because of internal deduplication. 

1410 export.saveDataIds([datasets[0].dataId]) 

1411 # Save some dimension records directly. 

1412 export.saveDimensionData("skymap", [skymapRecord]) 

1413 self.assertTrue(os.path.exists(exportFile)) 

1414 with safeTestTempDir(TESTDIR) as importDir: 

1415 # We always want this to be a local posix butler 

1416 Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml"))) 

1417 # Calling script.butlerImport tests the implementation of the 

1418 # butler command line interface "import" subcommand. Functions 

1419 # in the script folder are generally considered protected and 

1420 # should not be used as a public API. 

1421 with open(exportFile) as f: 

1422 script.butlerImport( 

1423 importDir, 

1424 export_file=f, 

1425 directory=exportDir, 

1426 transfer="auto", 

1427 skip_dimensions=None, 

1428 ) 

1429 importButler = Butler.from_config(importDir, run=self.default_run) 

1430 for ref in datasets: 

1431 with self.subTest(ref=ref): 

1432 # Test for existence by passing in the DatasetType and 

1433 # data ID separately, to avoid lookup by dataset_id. 

1434 self.assertTrue(importButler.exists(ref.datasetType, ref.dataId)) 

1435 self.assertEqual( 

1436 list(importButler.registry.queryDimensionRecords("skymap")), 

1437 [importButler.dimensions["skymap"].RecordClass(**skymapRecord)], 

1438 ) 

1439 

1440 def testRemoveRuns(self) -> None: 

1441 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1442 butler = Butler.from_config(self.tmpConfigFile, writeable=True) 

1443 # Load registry data with dimensions to hang datasets off of. 

1444 registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry")) 

1445 butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) 

1446 # Add some RUN-type collections. 

1447 run1 = "run1" 

1448 butler.registry.registerRun(run1) 

1449 run2 = "run2" 

1450 butler.registry.registerRun(run2) 

1451 # Put a dataset in each run. 

1452 metric = makeExampleMetrics() 

1453 dimensions = butler.dimensions.conform(["instrument", "physical_filter"]) 

1454 datasetType = self.addDatasetType( 

1455 "prune_collections_test_dataset", dimensions, storageClass, butler.registry 

1456 ) 

1457 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1) 

1458 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2) 

1459 uri1 = butler.getURI(ref1) 

1460 uri2 = butler.getURI(ref2) 

1461 

1462 with self.assertRaises(OrphanedRecordError): 

1463 butler.registry.removeDatasetType(datasetType.name) 

1464 

1465 # Remove from both runs with different values for unstore. 

1466 butler.removeRuns([run1], unstore=True) 

1467 butler.removeRuns([run2], unstore=False) 

1468 # Should be nothing in registry for either one, and datastore should 

1469 # not think either exists. 

1470 with self.assertRaises(MissingCollectionError): 

1471 butler.registry.getCollectionType(run1) 

1472 with self.assertRaises(MissingCollectionError): 

1473 butler.registry.getCollectionType(run2) 

1474 self.assertFalse(butler.stored(ref1)) 

1475 self.assertFalse(butler.stored(ref2)) 

1476 # The ref we unstored should be gone according to the URI, but the 

1477 # one we forgot should still be around. 

1478 self.assertFalse(uri1.exists()) 

1479 self.assertTrue(uri2.exists()) 

1480 

1481 # Now that the collections have been pruned we can remove the 

1482 # dataset type 

1483 butler.registry.removeDatasetType(datasetType.name) 

1484 

1485 with self.assertLogs("lsst.daf.butler.registry", "INFO") as cm: 

1486 butler.registry.removeDatasetType(("test*", "test*")) 

1487 self.assertIn("not defined", "\n".join(cm.output)) 

1488 

1489 

1490class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1491 """PosixDatastore specialization of a butler""" 

1492 

1493 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1494 fullConfigKey: str | None = ".datastore.formatters" 

1495 validationCanFail = True 

1496 datastoreStr = ["/tmp"] 

1497 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"] 

1498 registryStr = "/gen3.sqlite3" 

1499 

1500 def testPathConstructor(self) -> None: 

1501 """Independent test of constructor using PathLike.""" 

1502 butler = Butler.from_config(self.tmpConfigFile, run=self.default_run) 

1503 self.assertIsInstance(butler, Butler) 

1504 

1505 # And again with a Path object with the butler yaml 

1506 path = pathlib.Path(self.tmpConfigFile) 

1507 butler = Butler.from_config(path, writeable=False) 

1508 self.assertIsInstance(butler, Butler) 

1509 

1510 # And again with a Path object without the butler yaml 

1511 # (making sure we skip it if the tmp config doesn't end 

1512 # in butler.yaml -- which is the case for a subclass) 

1513 if self.tmpConfigFile.endswith("butler.yaml"): 

1514 path = pathlib.Path(os.path.dirname(self.tmpConfigFile)) 

1515 butler = Butler.from_config(path, writeable=False) 

1516 self.assertIsInstance(butler, Butler) 

1517 

1518 def testExportTransferCopy(self) -> None: 

1519 """Test local export using all transfer modes""" 

1520 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1521 exportButler = self.runPutGetTest(storageClass, "test_metric") 

1522 # Test that the repo actually has at least one dataset. 

1523 datasets = list(exportButler.registry.queryDatasets(..., collections=...)) 

1524 self.assertGreater(len(datasets), 0) 

1525 uris = [exportButler.getURI(d) for d in datasets] 

1526 assert isinstance(exportButler._datastore, FileDatastore) 

1527 datastoreRoot = exportButler.get_datastore_roots()[exportButler.get_datastore_names()[0]] 

1528 

1529 pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris] 

1530 

1531 for path in pathsInStore: 

1532 # Assume local file system 

1533 assert path is not None 

1534 self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}") 

1535 

1536 for transfer in ("copy", "link", "symlink", "relsymlink"): 

1537 with safeTestTempDir(TESTDIR) as exportDir: 

1538 with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export: 

1539 export.saveDatasets(datasets) 

1540 for path in pathsInStore: 

1541 assert path is not None 

1542 self.assertTrue( 

1543 self.checkFileExists(exportDir, path), 

1544 f"Check that mode {transfer} exported files", 

1545 ) 

1546 

1547 def testPruneDatasets(self) -> None: 

1548 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1549 butler = Butler.from_config(self.tmpConfigFile, writeable=True) 

1550 assert isinstance(butler._datastore, FileDatastore) 

1551 # Load registry data with dimensions to hang datasets off of. 

1552 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry")) 

1553 butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) 

1554 # Add some RUN-type collections. 

1555 run1 = "run1" 

1556 butler.registry.registerRun(run1) 

1557 run2 = "run2" 

1558 butler.registry.registerRun(run2) 

1559 # Put some datasets. ref1 and ref2 have the same data ID and are in 

1560 # different runs; ref3 has a different data ID. 

1561 metric = makeExampleMetrics() 

1562 dimensions = butler.dimensions.conform(["instrument", "physical_filter"]) 

1563 datasetType = self.addDatasetType( 

1564 "prune_collections_test_dataset", dimensions, storageClass, butler.registry 

1565 ) 

1566 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1) 

1567 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2) 

1568 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1) 

1569 

1570 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1571 for ref, stored in many_stored.items(): 

1572 self.assertTrue(stored, f"Ref {ref} should be stored") 

1573 

1574 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1575 for ref, exists in many_exists.items(): 

1576 self.assertTrue(exists, f"Checking ref {ref} exists.") 

1577 self.assertEqual(exists, DatasetExistence.VERIFIED, f"Ref {ref} should be stored") 

1578 

1579 # Simple prune. 

1580 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1581 self.assertFalse(butler.exists(ref1.datasetType, ref1.dataId, collections=run1)) 

1582 

1583 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1584 for ref, stored in many_stored.items(): 

1585 self.assertFalse(stored, f"Ref {ref} should not be stored") 

1586 

1587 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1588 for ref, exists in many_exists.items(): 

1589 self.assertEqual(exists, DatasetExistence.UNRECOGNIZED, f"Ref {ref} should not be stored") 

1590 

1591 # Put data back. 

1592 ref1_new = butler.put(metric, ref1) 

1593 self.assertEqual(ref1_new, ref1) # Reuses original ID. 

1594 ref2 = butler.put(metric, ref2) 

1595 

1596 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1597 self.assertTrue(many_stored[ref1]) 

1598 self.assertTrue(many_stored[ref2]) 

1599 self.assertFalse(many_stored[ref3]) 

1600 

1601 ref3 = butler.put(metric, ref3) 

1602 

1603 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1604 for ref, exists in many_exists.items(): 

1605 self.assertTrue(exists, f"Ref {ref} should be stored") 

1606 

1607 # Clear out the datasets from registry and start again. 

1608 refs = [ref1, ref2, ref3] 

1609 butler.pruneDatasets(refs, purge=True, unstore=True) 

1610 for ref in refs: 

1611 butler.put(metric, ref) 

1612 

1613 # Confirm we can retrieve deferred. 

1614 dref1 = butler.getDeferred(ref1) # known and exists 

1615 metric1 = dref1.get() 

1616 self.assertEqual(metric1, metric) 

1617 

1618 # Test different forms of file availability. 

1619 # Need to be in a state where: 

1620 # - one ref just has registry record. 

1621 # - one ref has a missing file but a datastore record. 

1622 # - one ref has a missing datastore record but file is there. 

1623 # - one ref does not exist anywhere. 

1624 # Do not need to test a ref that has everything since that is tested 

1625 # above. 
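# For reference, the assertions below (non-trusting butler with
# full_check=True) map these states to DatasetExistence values as:
#   ref0 (nothing anywhere)                      -> UNRECOGNIZED
#   ref1 (registry record only)                  -> RECORDED
#   ref2 (registry + datastore record, no file)  -> RECORDED | DATASTORE
#   ref3 (registry record + file, datastore record forgotten) -> RECORDED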

1626 ref0 = DatasetRef( 

1627 datasetType, 

1628 DataCoordinate.standardize( 

1629 {"instrument": "Cam1", "physical_filter": "Cam1-G"}, universe=butler.dimensions 

1630 ), 

1631 run=run1, 

1632 ) 

1633 

1634 # Delete from datastore and retain in Registry. 

1635 butler.pruneDatasets([ref1], purge=False, unstore=True, disassociate=False) 

1636 

1637 # File has been removed. 

1638 uri2 = butler.getURI(ref2) 

1639 uri2.remove() 

1640 

1641 # Datastore has lost track. 

1642 butler._datastore.forget([ref3]) 

1643 

1644 # First test with a standard butler. 

1645 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True) 

1646 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1647 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED) 

1648 self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE) 

1649 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED) 

1650 

1651 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=False) 

1652 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1653 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED | DatasetExistence._ASSUMED) 

1654 self.assertEqual(exists_many[ref2], DatasetExistence.KNOWN) 

1655 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ASSUMED) 

1656 self.assertTrue(exists_many[ref2]) 

1657 

1658 # Check that per-ref query gives the same answer as many query. 

1659 for ref, exists in exists_many.items(): 

1660 self.assertEqual(butler.exists(ref, full_check=False), exists) 

1661 

1662 # getDeferred checks for existence before it allows the dataset to 

1663 # be retrieved. 

1664 with self.assertRaises(LookupError): 

1665 butler.getDeferred(ref3) # not known, file exists 

1666 dref2 = butler.getDeferred(ref2) # known but file missing 

1667 with self.assertRaises(FileNotFoundError): 

1668 dref2.get() 

1669 

1670 # Test again with a trusting butler. 

1671 butler._datastore.trustGetRequest = True 

1672 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True) 

1673 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1674 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED) 

1675 self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE) 

1676 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ARTIFACT) 

1677 

1678 # When trusting we can get a deferred dataset handle that is not 

1679 # known but does exist. 

1680 dref3 = butler.getDeferred(ref3) 

1681 metric3 = dref3.get() 

1682 self.assertEqual(metric3, metric) 

1683 

1684 # Check that per-ref query gives the same answer as many query. 

1685 for ref, exists in exists_many.items(): 

1686 self.assertEqual(butler.exists(ref, full_check=True), exists) 

1687 

1688 # Create a ref that surprisingly has the UUID of an existing ref 

1689 # but is not the same. 

1690 ref_bad = DatasetRef(datasetType, dataId=ref3.dataId, run=ref3.run, id=ref2.id) 

1691 with self.assertRaises(ValueError): 

1692 butler.exists(ref_bad) 

1693 

1694 # Create a ref that has a compatible storage class. 

1695 ref_compat = ref2.overrideStorageClass("StructuredDataDict") 

1696 exists = butler.exists(ref_compat) 

1697 self.assertEqual(exists, exists_many[ref2]) 

1698 

1699 # Remove everything and start from scratch. 

1700 butler._datastore.trustGetRequest = False 

1701 butler.pruneDatasets(refs, purge=True, unstore=True) 

1702 for ref in refs: 

1703 butler.put(metric, ref) 

1704 

1705 # These tests mess directly with the trash table and can leave the 

1706 # datastore in an odd state. Do them at the end. 

1707 # Check that in normal mode, deleting the record will lead to 

1708 # trash not touching the file. 
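# (Sketch of the lifecycle exercised here: bridge.moveToTrash() marks the
# records, forget() drops the datastore's knowledge of the ref, and the
# later trash()/emptyTrash() pair therefore has no record linking the ref
# to the file, so the artifact survives.)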

1709 uri1 = butler.getURI(ref1) 

1710 butler._datastore.bridge.moveToTrash([ref1], transaction=None) # Update the dataset_location table 

1711 butler._datastore.forget([ref1]) 

1712 butler._datastore.trash(ref1) 

1713 butler._datastore.emptyTrash() 

1714 self.assertTrue(uri1.exists()) 

1715 uri1.remove() # Clean it up. 

1716 

1717 # Simulate execution butler setup by deleting the datastore 

1718 # record but keeping the file around and trusting. 

1719 butler._datastore.trustGetRequest = True 

1720 uris = butler.get_many_uris([ref2, ref3]) 

1721 uri2 = uris[ref2].primaryURI 

1722 uri3 = uris[ref3].primaryURI 

1723 self.assertTrue(uri2.exists()) 

1724 self.assertTrue(uri3.exists()) 

1725 

1726 # Remove the datastore record. 

1727 butler._datastore.bridge.moveToTrash([ref2], transaction=None) # Update the dataset_location table 

1728 butler._datastore.forget([ref2]) 

1729 self.assertTrue(uri2.exists()) 

1730 butler._datastore.trash([ref2, ref3]) 

1731 # Immediate removal for ref2 file 

1732 self.assertFalse(uri2.exists()) 

1733 # But ref3 has to wait for the empty. 

1734 self.assertTrue(uri3.exists()) 

1735 butler._datastore.emptyTrash() 

1736 self.assertFalse(uri3.exists()) 

1737 

1738 # Clear out the datasets from registry. 

1739 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1740 

1741 def testPytypeCoercion(self) -> None: 

1742 """Test python type coercion on Butler.get and put.""" 

1743 # Store some data with the normal example storage class. 

1744 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1745 datasetTypeName = "test_metric" 

1746 butler = self.runPutGetTest(storageClass, datasetTypeName) 

1747 

1748 dataId = {"instrument": "DummyCamComp", "visit": 423} 

1749 metric = butler.get(datasetTypeName, dataId=dataId) 

1750 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample") 

1751 

1752 datasetType_ori = butler.get_dataset_type(datasetTypeName) 

1753 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents") 

1754 

1755 # Now need to hack the registry dataset type definition. 

1756 # There is no API for this. 

1757 assert isinstance(butler._registry, SqlRegistry) 

1758 manager = butler._registry._managers.datasets 

1759 assert hasattr(manager, "_db") and hasattr(manager, "_static") 

1760 manager._db.update( 

1761 manager._static.dataset_type, 

1762 {"name": datasetTypeName}, 

1763 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"}, 

1764 ) 

1765 

1766 # Force reset of dataset type cache 

1767 butler.registry.refresh() 

1768 

1769 datasetType_new = butler.get_dataset_type(datasetTypeName) 

1770 self.assertEqual(datasetType_new.name, datasetType_ori.name) 

1771 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel") 

1772 

1773 metric_model = butler.get(datasetTypeName, dataId=dataId) 

1774 self.assertNotEqual(type(metric_model), type(metric)) 

1775 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel") 

1776 

1777 # Put the model and read it back to show that everything now 

1778 # works as normal. 

1779 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424) 

1780 metric_model_new = butler.get(metric_ref) 

1781 self.assertEqual(metric_model_new, metric_model) 

1782 

1783 # Hack the storage class again to something that will fail on the 

1784 # get with no conversion class. 

1785 manager._db.update( 

1786 manager._static.dataset_type, 

1787 {"name": datasetTypeName}, 

1788 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"}, 

1789 ) 

1790 butler.registry.refresh() 

1791 

1792 with self.assertRaises(ValueError): 

1793 butler.get(datasetTypeName, dataId=dataId) 

1794 

1795 

1796@unittest.skipUnless(testing is not None, "testing.postgresql module not found") 

1797class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1798 """PosixDatastore specialization of a butler using Postgres""" 

1799 

1800 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1801 fullConfigKey = ".datastore.formatters" 

1802 validationCanFail = True 

1803 datastoreStr = ["/tmp"] 

1804 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"] 

1805 registryStr = "PostgreSQL@test" 

1806 postgresql: Any 

1807 

1808 @staticmethod 

1809 def _handler(postgresql: Any) -> None: 

1810 engine = sqlalchemy.engine.create_engine(postgresql.url()) 

1811 with engine.begin() as connection: 

1812 connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;")) 

1813 

1814 @classmethod 

1815 def setUpClass(cls) -> None: 

1816 # Create the postgres test server. 

1817 cls.postgresql = testing.postgresql.PostgresqlFactory( 

1818 cache_initialized_db=True, on_initialized=cls._handler 

1819 ) 

1820 super().setUpClass() 

1821 

1822 @classmethod 

1823 def tearDownClass(cls) -> None: 

1824 # Clean up any lingering SQLAlchemy engines/connections 

1825 # so they're closed before we shut down the server. 

1826 gc.collect() 

1827 cls.postgresql.clear_cache() 

1828 super().tearDownClass() 

1829 

1830 def setUp(self) -> None: 

1831 self.server = self.postgresql() 

1832 

1833 # Need to add a registry section to the config. 

1834 self._temp_config = False 

1835 config = Config(self.configFile) 

1836 config["registry", "db"] = self.server.url() 

1837 with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh: 

1838 config.dump(fh) 

1839 self.configFile = fh.name 

1840 self._temp_config = True 

1841 super().setUp() 

1842 

1843 def tearDown(self) -> None: 

1844 self.server.stop() 

1845 if self._temp_config and os.path.exists(self.configFile): 

1846 os.remove(self.configFile) 

1847 super().tearDown() 

1848 

1849 def testMakeRepo(self) -> None: 

1850 # The base class test assumes that it is using SQLite and that the 

1851 # config file is acceptable to SQLite. 

1852 raise unittest.SkipTest("Postgres config is not compatible with this test.") 

1853 

1854 

1855@unittest.skipUnless(testing is not None, "testing.postgresql module not found") 

1856class ClonedPostgresPosixDatastoreButlerTestCase(PostgresPosixDatastoreButlerTestCase, unittest.TestCase): 

1857 """Test that Butler with a Postgres registry still works after cloning.""" 

1858 

1859 def create_butler( 

1860 self, run: str, storageClass: StorageClass | str, datasetTypeName: str 

1861 ) -> tuple[DirectButler, DatasetType]: 

1862 butler, datasetType = super().create_butler(run, storageClass, datasetTypeName) 

1863 return butler._clone(run=run), datasetType 

1864 

1865 

1866class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1867 """InMemoryDatastore specialization of a butler""" 

1868 

1869 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1870 fullConfigKey = None 

1871 useTempRoot = False 

1872 validationCanFail = False 

1873 datastoreStr = ["datastore='InMemory"] 

1874 datastoreName = ["InMemoryDatastore@"] 

1875 registryStr = "/gen3.sqlite3" 

1876 

1877 def testIngest(self) -> None: 

1878 pass 

1879 

1880 

1881class ClonedSqliteButlerTestCase(InMemoryDatastoreButlerTestCase, unittest.TestCase): 

1882 """Test that a Butler with a Sqlite registry still works after cloning.""" 

1883 

1884 def create_butler( 

1885 self, run: str, storageClass: StorageClass | str, datasetTypeName: str 

1886 ) -> tuple[DirectButler, DatasetType]: 

1887 butler, datasetType = super().create_butler(run, storageClass, datasetTypeName) 

1888 return butler._clone(run=run), datasetType 

1889 

1890 

1891class ChainedDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1892 """PosixDatastore specialization""" 

1893 

1894 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

1895 fullConfigKey = ".datastore.datastores.1.formatters" 

1896 validationCanFail = True 

1897 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"] 

1898 datastoreName = [ 

1899 "InMemoryDatastore@", 

1900 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1", 

1901 "SecondDatastore", 

1902 ] 

1903 registryStr = "/gen3.sqlite3" 

1904 

1905 

1906class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase): 

1907 """Test that a yaml file in one location can refer to a root in another.""" 

1908 

1909 datastoreStr = ["dir1"] 

1910 # Disable the makeRepo test since we are deliberately not using 

1911 # butler.yaml as the config name. 

1912 fullConfigKey = None 

1913 

1914 def setUp(self) -> None: 

1915 self.root = makeTestTempDir(TESTDIR) 

1916 

1917 # Make a new repository in one place 

1918 self.dir1 = os.path.join(self.root, "dir1") 

1919 Butler.makeRepo(self.dir1, config=Config(self.configFile)) 

1920 

1921 # Move the yaml file to a different place and add a "root" 

1922 self.dir2 = os.path.join(self.root, "dir2") 

1923 os.makedirs(self.dir2, exist_ok=True) 

1924 configFile1 = os.path.join(self.dir1, "butler.yaml") 

1925 config = Config(configFile1) 

1926 config["root"] = self.dir1 

1927 configFile2 = os.path.join(self.dir2, "butler2.yaml") 

1928 config.dumpToUri(configFile2) 

1929 os.remove(configFile1) 

1930 self.tmpConfigFile = configFile2 

1931 

1932 def testFileLocations(self) -> None: 

1933 self.assertNotEqual(self.dir1, self.dir2) 

1934 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml"))) 

1935 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml"))) 

1936 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3"))) 

1937 

1938 

1939class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase): 

1940 """Test that a config file created by makeRepo outside of repo works.""" 

1941 

1942 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1943 

1944 def setUp(self) -> None: 

1945 self.root = makeTestTempDir(TESTDIR) 

1946 self.root2 = makeTestTempDir(TESTDIR) 

1947 

1948 self.tmpConfigFile = os.path.join(self.root2, "different.yaml") 

1949 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1950 

1951 def tearDown(self) -> None: 

1952 if os.path.exists(self.root2): 

1953 shutil.rmtree(self.root2, ignore_errors=True) 

1954 super().tearDown() 

1955 

1956 def testConfigExistence(self) -> None: 

1957 c = Config(self.tmpConfigFile) 

1958 uri_config = ResourcePath(c["root"]) 

1959 uri_expected = ResourcePath(self.root, forceDirectory=True) 

1960 self.assertEqual(uri_config.geturl(), uri_expected.geturl()) 

1961 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path") 

1962 

1963 def testPutGet(self) -> None: 

1964 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1965 self.runPutGetTest(storageClass, "test_metric") 

1966 

1967 

1968class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase): 

1969 """Test that a config file created by makeRepo outside of repo works.""" 

1970 

1971 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1972 

1973 def setUp(self) -> None: 

1974 self.root = makeTestTempDir(TESTDIR) 

1975 self.root2 = makeTestTempDir(TESTDIR) 

1976 

1977 self.tmpConfigFile = self.root2 

1978 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1979 

1980 def testConfigExistence(self) -> None: 

1981 # Append the yaml file name, else the Config constructor does not know 

1982 # the file type. 

1983 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml") 

1984 super().testConfigExistence() 

1985 

1986 

1987class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase): 

1988 """Test that a config file created by makeRepo outside of repo works.""" 

1989 

1990 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1991 

1992 def setUp(self) -> None: 

1993 self.root = makeTestTempDir(TESTDIR) 

1994 self.root2 = makeTestTempDir(TESTDIR) 

1995 

1996 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl() 

1997 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1998 

1999 

2000@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!") 

2001class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

2002 """S3Datastore specialization of a butler; an S3 storage Datastore + 

2003 a local in-memory SqlRegistry. 

2004 """ 

2005 

2006 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml") 

2007 fullConfigKey = None 

2008 validationCanFail = True 

2009 

2010 bucketName = "anybucketname" 

2011 """Name of the Bucket that will be used in the tests. The name is read from 

2012 the config file used with the tests during set-up. 

2013 """ 

2014 

2015 root = "butlerRoot/" 

2016 """Root repository directory expected to be used in case useTempRoot=False. 

2017 Otherwise the root is set to a 20 characters long randomly generated string 

2018 during set-up. 

2019 """ 

2020 

2021 datastoreStr = [f"datastore={root}"] 

2022 """Contains all expected root locations in a format expected to be 

2023 returned by Butler stringification. 

2024 """ 

2025 

2026 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"] 

2027 """The expected format of the S3 Datastore string.""" 

2028 

2029 registryStr = "/gen3.sqlite3" 

2030 """Expected format of the Registry string.""" 

2031 

2032 mock_aws = mock_aws() 

2033 """The mocked s3 interface from moto.""" 

2034 

2035 def genRoot(self) -> str: 

2036 """Return a random string of len 20 to serve as a root 

2037 name for the temporary bucket repo. 

2038 

2039 This is equivalent to tempfile.mkdtemp as this is what self.root 

2040 becomes when useTempRoot is True. 

2041 """ 

2042 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20)) 

2043 return rndstr + "/" 

2044 

2045 def setUp(self) -> None: 

2046 config = Config(self.configFile) 

2047 uri = ResourcePath(config[".datastore.datastore.root"]) 

2048 self.bucketName = uri.netloc 

2049 

2050 # Enable S3 mocking of tests. 

2051 self.enterContext(clean_test_environment_for_s3()) 

2052 self.mock_aws.start() 

2053 

2054 if self.useTempRoot: 

2055 self.root = self.genRoot() 

2056 rooturi = f"s3://{self.bucketName}/{self.root}" 

2057 config.update({"datastore": {"datastore": {"root": rooturi}}}) 

2058 

2059 # Need a local folder to store the registry database. 

2060 self.reg_dir = makeTestTempDir(TESTDIR) 

2061 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

2062 

2063 # Moto needs to know that we expect the bucket to exist 

2064 # (the name used to be the class attribute bucketName). 

2065 s3 = boto3.resource("s3") 

2066 s3.create_bucket(Bucket=self.bucketName) 

2067 

2068 self.datastoreStr = [f"datastore='{rooturi}'"] 

2069 self.datastoreName = [f"FileDatastore@{rooturi}"] 

2070 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False) 

2071 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml") 

2072 

2073 def tearDown(self) -> None: 

2074 s3 = boto3.resource("s3") 

2075 bucket = s3.Bucket(self.bucketName) 

2076 try: 

2077 bucket.objects.all().delete() 

2078 except botocore.exceptions.ClientError as e: 

2079 if e.response["Error"]["Code"] == "404": 

2080 # the key was not reachable - pass 

2081 pass 

2082 else: 

2083 raise 

2084 

2085 bucket = s3.Bucket(self.bucketName) 

2086 bucket.delete() 

2087 

2088 # Stop the S3 mock. 

2089 self.mock_aws.stop() 

2090 

2091 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

2092 shutil.rmtree(self.reg_dir, ignore_errors=True) 

2093 

2094 if self.useTempRoot and os.path.exists(self.root): 

2095 shutil.rmtree(self.root, ignore_errors=True) 

2096 

2097 super().tearDown() 

2098 

2099 

2100class PosixDatastoreTransfers(unittest.TestCase): 

2101 """Test data transfers between butlers. 

2102 

2103 Tests cover different dataset ID managers: UUID to UUID and integer to 

2104 integer. UUID to integer is not supported since we do not currently 

2105 want to allow that. Integer to UUID is supported, with the caveat 

2106 that a UUID4 will be generated, which is incorrect for raw 

2107 dataset types; the tests ignore that. 

2108 """ 

2109 

2110 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

2111 storageClassFactory: StorageClassFactory 

2112 

2113 @classmethod 

2114 def setUpClass(cls) -> None: 

2115 cls.storageClassFactory = StorageClassFactory() 

2116 cls.storageClassFactory.addFromConfig(cls.configFile) 

2117 

2118 def setUp(self) -> None: 

2119 self.root = makeTestTempDir(TESTDIR) 

2120 self.config = Config(self.configFile) 

2121 

2122 def tearDown(self) -> None: 

2123 removeTestTempDir(self.root) 

2124 

2125 def create_butler(self, manager: str, label: str) -> Butler: 

2126 config = Config(self.configFile) 

2127 config["registry", "managers", "datasets"] = manager 

2128 return Butler.from_config( 

2129 Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True 

2130 ) 

2131 

2132 def create_butlers(self, manager1: str | None = None, manager2: str | None = None) -> None: 

2133 default = "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID" 

2134 if manager1 is None: 

2135 manager1 = default 

2136 if manager2 is None: 

2137 manager2 = default 

2138 self.source_butler = self.create_butler(manager1, "1") 

2139 self.target_butler = self.create_butler(manager2, "2") 

2140 

2141 def testTransferUuidToUuid(self) -> None: 

2142 self.create_butlers() 

2143 self.assertButlerTransfers() 

2144 

2145 def testTransferMissing(self) -> None: 

2146 """Test transfers where datastore records are missing. 

2147 

2148 This is how execution butler works. 

2149 """ 

2150 self.create_butlers() 

2151 

2152 # Configure the source butler to allow trust. 

2153 self.source_butler._datastore._set_trust_mode(True) 

2154 

2155 self.assertButlerTransfers(purge=True) 

2156 

2157 def testTransferMissingDisassembly(self) -> None: 

2158 """Test transfers where datastore records are missing. 

2159 

2160 This is how execution butler works. 

2161 """ 

2162 self.create_butlers() 

2163 

2164 # Configure the source butler to allow trust. 

2165 self.source_butler._datastore._set_trust_mode(True) 

2166 

2167 # Test disassembly. 

2168 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite") 

2169 

2170 def testAbsoluteURITransferDirect(self) -> None: 

2171 """Test transfer using an absolute URI.""" 

2172 self._absolute_transfer("auto") 

2173 

2174 def testAbsoluteURITransferCopy(self) -> None: 

2175 """Test transfer using an absolute URI.""" 

2176 self._absolute_transfer("copy") 

2177 

2178 def _absolute_transfer(self, transfer: str) -> None: 

2179 self.create_butlers() 

2180 

2181 storageClassName = "StructuredData" 

2182 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

2183 datasetTypeName = "random_data" 

2184 run = "run1" 

2185 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

2186 

2187 dimensions = self.source_butler.dimensions.conform(()) 

2188 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

2189 self.source_butler.registry.registerDatasetType(datasetType) 

2190 

2191 metrics = makeExampleMetrics() 

2192 with ResourcePath.temporary_uri(suffix=".json") as temp: 

2193 dataId = DataCoordinate.make_empty(self.source_butler.dimensions) 

2194 source_refs = [DatasetRef(datasetType, dataId, run=run)] 

2195 temp.write(json.dumps(metrics.exportAsDict()).encode()) 

2196 dataset = FileDataset(path=temp, refs=source_refs) 

2197 self.source_butler.ingest(dataset, transfer="direct") 

2198 

2199 self.target_butler.transfer_from( 

2200 self.source_butler, dataset.refs, register_dataset_types=True, transfer=transfer 

2201 ) 

2202 

2203 uri = self.target_butler.getURI(dataset.refs[0]) 

2204 if transfer == "auto": 

2205 self.assertEqual(uri, temp) 

2206 else: 

2207 self.assertNotEqual(uri, temp) 

2208 

2209 def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "StructuredData") -> None: 

2210 """Test that a run can be transferred to another butler.""" 

2211 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

2212 datasetTypeName = "random_data" 

2213 

2214 # Test will create 3 collections and we will want to transfer 

2215 # two of those three. 

2216 runs = ["run1", "run2", "other"] 

2217 

2218 # Also want to use two different dataset types to ensure that 

2219 # grouping works. 

2220 datasetTypeNames = ["random_data", "random_data_2"] 

2221 

2222 # Create the run collections in the source butler. 

2223 for run in runs: 

2224 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

2225 

2226 # Create dimensions in source butler. 

2227 n_exposures = 30 

2228 self.source_butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

2229 self.source_butler.registry.insertDimensionData( 

2230 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

2231 ) 

2232 self.source_butler.registry.insertDimensionData( 

2233 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"} 

2234 ) 

2235 

2236 for i in range(n_exposures): 

2237 self.source_butler.registry.insertDimensionData( 

2238 "exposure", 

2239 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"}, 

2240 ) 

2241 

2242 # Create dataset types in the source butler. 

2243 dimensions = self.source_butler.dimensions.conform(["instrument", "exposure"]) 

2244 for datasetTypeName in datasetTypeNames: 

2245 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

2246 self.source_butler.registry.registerDatasetType(datasetType) 

2247 

2248 # Write a dataset to an unrelated run -- this will ensure that 

2249 # we are rewriting integer dataset ids in the target if necessary. 

2250 # Will not be relevant for UUID. 

2251 run = "distraction" 

2252 butler = Butler.from_config(butler=self.source_butler, run=run) 

2253 butler.put( 

2254 makeExampleMetrics(), 

2255 datasetTypeName, 

2256 exposure=1, 

2257 instrument="DummyCamComp", 

2258 physical_filter="d-r", 

2259 ) 

2260 

2261 # Write some example metrics to the source 

2262 butler = Butler.from_config(butler=self.source_butler) 

2263 

2264 # Set of DatasetRefs that should be in the list of refs to transfer 

2265 # but which will not be transferred. 

2266 deleted: set[DatasetRef] = set() 

2267 

2268 n_expected = 20 # Number of datasets expected to be transferred 

2269 source_refs = [] 

2270 for i in range(n_exposures): 

2271 # Put a third of datasets into each collection, only retain 

2272 # two thirds. 

2273 index = i % 3 

2274 run = runs[index] 

2275 datasetTypeName = datasetTypeNames[i % 2] 

2276 

2277 metric = MetricsExample( 

2278 summary={"counter": i}, output={"text": "metric"}, data=[2 * x for x in range(i)] 

2279 ) 

2280 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"} 

2281 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run) 

2282 

2283 # Remove the datastore record using low-level API, but only 

2284 # for a specific index. 

2285 if purge and index == 1: 

2286 # For one of these delete the file as well. 

2287 # This allows the "missing" code to filter the 

2288 # file out. 

2289 # Access the individual datastores. 

2290 datastores = [] 

2291 if hasattr(butler._datastore, "datastores"): 

2292 datastores.extend(butler._datastore.datastores) 

2293 else: 

2294 datastores.append(butler._datastore) 

2295 

2296 if not deleted: 

2297 # For a chained datastore we need to remove 

2298 # files in each chain. 

2299 for datastore in datastores: 

2300 # The file might not be known to the datastore 

2301 # if constraints are used. 

2302 try: 

2303 primary, uris = datastore.getURIs(ref) 

2304 except FileNotFoundError: 

2305 continue 

2306 if primary and primary.scheme != "mem": 

2307 primary.remove() 

2308 for uri in uris.values(): 

2309 if uri.scheme != "mem": 

2310 uri.remove() 

2311 n_expected -= 1 

2312 deleted.add(ref) 

2313 

2314 # Remove the datastore record. 

2315 for datastore in datastores: 

2316 if hasattr(datastore, "removeStoredItemInfo"): 

2317 datastore.removeStoredItemInfo(ref) 

2318 

2319 if index < 2: 

2320 source_refs.append(ref) 

2321 if ref not in deleted: 

2322 new_metric = butler.get(ref) 

2323 self.assertEqual(new_metric, metric) 

2324 

2325 # Create some bad dataset types to ensure we check for inconsistent 

2326 # definitions. 

2327 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList") 

2328 for datasetTypeName in datasetTypeNames: 

2329 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass) 

2330 self.target_butler.registry.registerDatasetType(datasetType) 

2331 with self.assertRaises(ConflictingDefinitionError) as cm: 

2332 self.target_butler.transfer_from(self.source_butler, source_refs) 

2333 self.assertIn("dataset type differs", str(cm.exception)) 

2334 

2335 # And remove the bad definitions. 

2336 for datasetTypeName in datasetTypeNames: 

2337 self.target_butler.registry.removeDatasetType(datasetTypeName) 

2338 

2339 # Transfer without creating dataset types should fail. 

2340 with self.assertRaises(KeyError): 

2341 self.target_butler.transfer_from(self.source_butler, source_refs) 

2342 

2343 # Transfer without creating dimensions should fail. 

2344 with self.assertRaises(ConflictingDefinitionError) as cm: 

2345 self.target_butler.transfer_from(self.source_butler, source_refs, register_dataset_types=True) 

2346 self.assertIn("dimension", str(cm.exception)) 

2347 

2348 # The failed transfer above leaves registry in an inconsistent 

2349 # state because the run is created but then rolled back without 

2350 # the collection cache being cleared. For now force a refresh. 

2351 # Can remove with DM-35498. 

2352 self.target_butler.registry.refresh() 

2353 

2354 # Do a dry run -- this should not have any effect on the target butler. 

2355 self.target_butler.transfer_from(self.source_butler, source_refs, dry_run=True) 

2356 

2357 # Transfer the records for one ref to test the alternative API. 

2358 with self.assertLogs(logger="lsst", level=logging.DEBUG) as log_cm: 

2359 self.target_butler.transfer_dimension_records_from(self.source_butler, [source_refs[0]]) 

2360 self.assertIn("number of records transferred: 1", ";".join(log_cm.output)) 

2361 

2362 # Now transfer them to the second butler, including dimensions. 

2363 with self.assertLogs(logger="lsst", level=logging.DEBUG) as log_cm: 

2364 transferred = self.target_butler.transfer_from( 

2365 self.source_butler, 

2366 source_refs, 

2367 register_dataset_types=True, 

2368 transfer_dimensions=True, 

2369 ) 

2370 self.assertEqual(len(transferred), n_expected) 

2371 log_output = ";".join(log_cm.output) 

2372 

2373 # A ChainedDatastore will use the in-memory datastore for mexists, 

2374 # so we cannot rely on the mexists log message. 

2375 self.assertIn("Number of datastore records found in source", log_output) 

2376 self.assertIn("Creating output run", log_output) 

2377 

2378 # Do the transfer twice to ensure that it will do nothing extra. 

2379 # Only do this if purge=True because it does not work for int 

2380 # dataset_id. 

2381 if purge: 

2382 # This should not need to register dataset types. 

2383 transferred = self.target_butler.transfer_from(self.source_butler, source_refs) 

2384 self.assertEqual(len(transferred), n_expected) 

2385 

2386 # Also do an explicit low-level transfer to trigger some 

2387 # edge cases. 

2388 with self.assertLogs(level=logging.DEBUG) as log_cm: 

2389 self.target_butler._datastore.transfer_from(self.source_butler._datastore, source_refs) 

2390 log_output = ";".join(log_cm.output) 

2391 self.assertIn("no file artifacts exist", log_output) 

2392 

2393 with self.assertRaises((TypeError, AttributeError)): 

2394 self.target_butler._datastore.transfer_from(self.source_butler, source_refs) # type: ignore 

2395 

2396 with self.assertRaises(ValueError): 

2397 self.target_butler._datastore.transfer_from( 

2398 self.source_butler._datastore, source_refs, transfer="split" 

2399 ) 

2400 

2401 # Now try to get the same refs from the new butler. 

2402 for ref in source_refs: 

2403 if ref not in deleted: 

2404 new_metric = self.target_butler.get(ref) 

2405 old_metric = self.source_butler.get(ref) 

2406 self.assertEqual(new_metric, old_metric) 

2407 

2408 # Now prune run2 collection and create instead a CHAINED collection. 

2409 # This should block the transfer. 

2410 self.target_butler.removeRuns(["run2"], unstore=True) 

2411 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED) 

2412 with self.assertRaises(CollectionTypeError): 

2413 # Re-importing the run1 datasets can be problematic if they 

2414 # use integer IDs so filter those out. 

2415 to_transfer = [ref for ref in source_refs if ref.run == "run2"] 

2416 self.target_butler.transfer_from(self.source_butler, to_transfer) 

2417 

2418 

2419class ChainedDatastoreTransfers(PosixDatastoreTransfers): 

2420 """Test transfers using a chained datastore.""" 

2421 

2422 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

2423 

2424 

2425class NullDatastoreTestCase(unittest.TestCase): 

2426 """Test that we can fall back to a null datastore.""" 

2427 

2428 # Need a good config to create the repo. 

2429 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

2430 storageClassFactory: StorageClassFactory 

2431 

2432 @classmethod 

2433 def setUpClass(cls) -> None: 

2434 cls.storageClassFactory = StorageClassFactory() 

2435 cls.storageClassFactory.addFromConfig(cls.configFile) 

2436 

2437 def setUp(self) -> None: 

2438 """Create a new butler root for each test.""" 

2439 self.root = makeTestTempDir(TESTDIR) 

2440 Butler.makeRepo(self.root, config=Config(self.configFile)) 

2441 

2442 def tearDown(self) -> None: 

2443 removeTestTempDir(self.root) 

2444 

2445 def test_fallback(self) -> None: 

2446 # Read the butler config and mess with the datastore section. 

2447 config_path = os.path.join(self.root, "butler.yaml") 

2448 bad_config = Config(config_path) 

2449 bad_config["datastore", "cls"] = "lsst.not.a.datastore.Datastore" 

2450 bad_config.dumpToUri(config_path) 

2451 

2452 with self.assertRaises(RuntimeError): 

2453 Butler(self.root, without_datastore=False) 

2454 

2455 with self.assertRaises(RuntimeError): 

2456 Butler.from_config(self.root, without_datastore=False) 

2457 

2458 butler = Butler.from_config(self.root, writeable=True, without_datastore=True) 

2459 self.assertIsInstance(butler._datastore, NullDatastore) 

2460 

2461 # Check that registry is working. 

2462 butler.registry.registerRun("MYRUN") 

2463 collections = butler.registry.queryCollections(...) 

2464 self.assertIn("MYRUN", set(collections)) 

2465 

2466 # Create a ref. 

2467 dimensions = butler.dimensions.conform([]) 

2468 storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict") 

2469 datasetTypeName = "metric" 

2470 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

2471 butler.registry.registerDatasetType(datasetType) 

2472 ref = DatasetRef(datasetType, {}, run="MYRUN") 

2473 

2474 # Check that datastore will complain. 

2475 with self.assertRaises(FileNotFoundError): 

2476 butler.get(ref) 

2477 with self.assertRaises(FileNotFoundError): 

2478 butler.getURI(ref) 

2479 

2480 

2481def setup_module(module: types.ModuleType) -> None: 

2482 """Set up the module for pytest.""" 

2483 clean_environment() 

2484 

2485 

2486if __name__ == "__main__": 

2487 clean_environment() 

2488 unittest.main()