Coverage for tests/test_butler.py: 13%


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler."""

from __future__ import annotations

import gc
import json
import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import string
import tempfile
import unittest
import unittest.mock  # needed explicitly for unittest.mock.patch.dict below
import uuid
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, cast


try:
    import boto3
    import botocore
    from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
    from moto import mock_s3  # type: ignore[import]
except ImportError:
    boto3 = None

    def mock_s3(*args: Any, **kwargs: Any) -> Any:  # type: ignore[no-untyped-def]
        """No-op decorator in case moto mock_s3 can not be imported."""
        return None

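# A minimal usage sketch, assuming the skip-guard pattern that normally
# accompanies this fallback (the class name below is hypothetical): the
# guard means the ``None`` returned by the no-op ``mock_s3`` is never
# actually used as a class.
#
#     @unittest.skipIf(not boto3, "moto/boto3 not available")
#     @mock_s3
#     class ExampleS3TestCase(unittest.TestCase):
#         ...
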


try:
    # It's possible but silly to have testing.postgresql installed without
    # having the postgresql server installed (because then nothing in
    # testing.postgresql would work), so we use the presence of that module
    # to test whether we can expect the server to be available.
    import testing.postgresql  # type: ignore[import]
except ImportError:
    testing = None


import astropy.time
import sqlalchemy
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    ButlerRepoIndex,
    CollectionType,
    Config,
    DataCoordinate,
    DatasetExistence,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.datastore import NullDatastore
from lsst.daf.butler.datastore.file_templates import FileTemplate, FileTemplateValidationError
from lsst.daf.butler.datastores.fileDatastore import FileDatastore
from lsst.daf.butler.direct_butler import DirectButler
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
    OrphanedRecordError,
)
from lsst.daf.butler.registry.sql_registry import SqlRegistry
from lsst.daf.butler.repo_relocation import BUTLER_ROOT_TAG
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.introspection import get_full_type_name

if TYPE_CHECKING:
    import types

    from lsst.daf.butler import Datastore, DimensionGraph, Registry, StorageClass

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def clean_environment() -> None:
    """Remove external environment variables that affect the tests."""
    for k in (
        "DAF_BUTLER_REPOSITORY_INDEX",
        "S3_ENDPOINT_URL",
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SHARED_CREDENTIALS_FILE",
    ):
        os.environ.pop(k, None)

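# A minimal usage sketch, assuming module-level cleanup is wanted; the
# ``setUpModule`` hook is standard unittest, though where this helper is
# actually invoked depends on the individual test modules:
#
#     def setUpModule() -> None:
#         clean_environment()
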


def makeExampleMetrics() -> MetricsExample:
    """Return example dataset suitable for tests."""
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )

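# For reference, the three positional arguments above populate the
# ``summary``, ``output`` and ``data`` attributes compared against in the
# tests below (a sketch, assuming that positional order):
#
#     metric = makeExampleMetrics()
#     metric.summary   # -> {"AM1": 5.2, "AM2": 30.6}
#     metric.data[:2]  # -> [563, 234]
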


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent misdiagnosing
    that might otherwise occur when a standard exception is used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not tested in any other test
    cases.
    """

    def testSearchPath(self) -> None:
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests(TestCaseMixin):
    """Helper class for running a suite of put/get tests from different
    butler configurations.
    """


    root: str
    default_run = "ingésτ😺"
    storageClassFactory: StorageClassFactory
    configFile: str
    tmpConfigFile: str

    @staticmethod
    def addDatasetType(
        datasetTypeName: str, dimensions: DimensionGraph, storageClass: StorageClass | str, registry: Registry
    ) -> DatasetType:
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls) -> None:
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(
        self,
        butler: Butler,
        datasetRef: DatasetRef,
        components: tuple[str, ...],
        reference: Any,
        collections: Any = None,
    ) -> None:
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

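    # A short sketch of the component naming convention relied on above,
    # assuming the standard "parent.component" form:
    #
    #     DatasetType.nameWithComponent("metric", "summary")  # -> "metric.summary"
    #     datasetType.componentTypeName("summary")  # -> "<dataset type name>.summary"
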

    def tearDown(self) -> None:
        removeTestTempDir(self.root)

    def create_butler(
        self, run: str, storageClass: StorageClass | str, datasetTypeName: str
    ) -> tuple[DirectButler, DatasetType]:
        butler = Butler.from_config(self.tmpConfigFile, run=run)
        assert isinstance(butler, DirectButler), "Expect DirectButler in configuration"

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {run})

        # Create and register a DatasetType
        dimensions = butler.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "visit_system": 1,
                },
            )
        return butler, datasetType

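    # A sketch of what the dimension records registered above enable,
    # assuming the usual expanded-data-ID behaviour (the values mirror the
    # inserts in create_butler):
    #
    #     data_id = butler.registry.expandDataId(
    #         {"instrument": "DummyCamComp", "visit": 423}
    #     )
    #     data_id.records["visit"].name  # -> "fourtwentythree"
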

    def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> DirectButler:
        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = self.default_run
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)
        assert butler.run is not None

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = butler.registry.expandDataId({"instrument": "DummyCamComp", "visit": 423})

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        ref = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run="put_run_1")

        args: tuple[DatasetRef] | tuple[str | DatasetType, DataCoordinate]
        for args in ((ref,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                kwargs: dict[str, Any] = {}
                if not isinstance(args[0], DatasetRef):  # type: ignore
                    kwargs["run"] = this_run
                ref = butler.put(metric, *args, **kwargs)
                self.assertIsInstance(ref, DatasetRef)

                # Test getDirect
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a ref
                metricOut = butler.getDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

                # Can the artifacts themselves be retrieved?
                if not butler._datastore.isEphemeral:
                    root_uri = ResourcePath(self.root)

                    for preserve_path in (True, False):
                        destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                        # Use copy so that we can test that overwrite
                        # protection works (using "auto" for File URIs would
                        # use hard links and subsequent transfer would work
                        # because it knows they are the same file).
                        transferred = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, transfer="copy"
                        )
                        self.assertGreater(len(transferred), 0)
                        artifacts = list(ResourcePath.findFileResources([destination]))
                        self.assertEqual(set(transferred), set(artifacts))

                        for artifact in transferred:
                            path_in_destination = artifact.relative_to(destination)
                            self.assertIsNotNone(path_in_destination)
                            assert path_in_destination is not None

                            # When path is not preserved there should not be
                            # any path separators.
                            num_seps = path_in_destination.count("/")
                            if preserve_path:
                                self.assertGreater(num_seps, 0)
                            else:
                                self.assertEqual(num_seps, 0)

                        primary_uri, secondary_uris = butler.getURIs(ref)
                        n_uris = len(secondary_uris)
                        if primary_uri:
                            n_uris += 1
                        self.assertEqual(
                            len(artifacts),
                            n_uris,
                            "Comparing expected artifacts vs actual:"
                            f" {artifacts} vs {primary_uri} and {secondary_uris}",
                        )

                        if preserve_path:
                            # No need to run these twice
                            with self.assertRaises(ValueError):
                                butler.retrieveArtifacts([ref], destination, transfer="move")

                            with self.assertRaises(FileExistsError):
                                butler.retrieveArtifacts([ref], destination)

                            transferred_again = butler.retrieveArtifacts(
                                [ref], destination, preserve_path=preserve_path, overwrite=True
                            )
                            self.assertEqual(set(transferred_again), set(transferred))

                # Now remove the dataset completely.
                butler.pruneDatasets([ref], purge=True, unstore=True)
                # Lookup with original args should still fail.
                kwargs = {"collections": this_run}
                if isinstance(args[0], DatasetRef):
                    kwargs = {}  # Prevent warning from being issued.
                self.assertFalse(butler.exists(*args, **kwargs))
                # get() should still fail.
                with self.assertRaises(FileNotFoundError):
                    butler.get(ref)
                # Registry shouldn't be able to find it by dataset_id anymore.
                self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know they are
            # empty
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Create DatasetRef for put using default run.
        refIn = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run=butler.run)

        # Check that getDeferred fails with standalone ref.
        with self.assertRaises(LookupError):
            butler.getDeferred(refIn)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        assert metric.data is not None  # for mypy
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            assert compRef is not None
            summary = butler.get(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaisesRegex(ValueError, "Supplied dataset type .* inconsistent with registry"):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaisesRegex(ValueError, "DatasetRef given, cannot use dataId as well"):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match.
        with self.assertRaises(FileNotFoundError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=uuid.UUID(int=101), run=butler.run))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaisesRegex(KeyError, "Parameter 'unsupported' not understood"):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Add the same ref again, so we can check that duplicate put fails.
        ref = butler.put(metric, datasetType, dataId)

        # Repeat put will fail.
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Repeat the same sequence with resolved ref.
        butler.pruneDatasets([ref], unstore=True, purge=True)
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaisesRegex(ConflictingDefinitionError, "Datastore already contains dataset"):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # In case of resolved ref this write will succeed.
        ref = butler.put(metric, refIn)

        # Leave the dataset in place since some downstream tests require
        # something to be present.

        return butler

    def testDeferredCollectionPassing(self) -> None:
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler.from_config(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op.
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)

        # Putting with no run should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)

        # Dataset should exist.
        self.assertTrue(butler.exists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())

        # Trying to find the dataset without any collection raises
        # CollectionError.
        self.assertFalse(butler.exists(datasetType, dataId))
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)

        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])

        # Removing the dataset from the new collection should leave it
        # findable in the original run collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.exists(datasetType, dataId, collections=[run]))



class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True
    validationCanFail: bool
    fullConfigKey: str | None
    registryStr: str | None
    datastoreName: list[str] | None
    datastoreStr: list[str]

    def setUp(self) -> None:
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")


    def testConstructor(self) -> None:
        """Independent test of constructor."""
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler.from_config(config_dir, run=self.default_run)
            self.assertIsInstance(butler, Butler)

            # Even with a ResourcePath.
            butler = Butler.from_config(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {self.default_run})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler.from_config(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler.from_config(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, ("other",))
        self.assertIsNone(butler2.run)
        self.assertIs(butler._datastore, butler2._datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"file://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), {"label", "bad_label"})
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler.from_config(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler.from_config("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler.from_config("not_there", writeable=False)
                    with self.assertRaisesRegex(FileNotFoundError, "resolved from alias 'bad_label'"):
                        Butler.from_config("bad_label")
                    with self.assertRaises(FileNotFoundError):
                        # Should ignore aliases.
                        Butler.from_config(ResourcePath("label", forceAbsolute=False))
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertEqual(
                        Butler.get_repo_uri("missing", True), ResourcePath("missing", forceAbsolute=False)
                    )
                    self.assertIn("not known to", str(cm.exception))
                    # Should report no failure.
                    self.assertEqual(ButlerRepoIndex.get_failure_reason(), "")
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                # Now with empty configuration.
                butler_index = Config()
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    with self.assertRaisesRegex(FileNotFoundError, "(no known aliases)"):
                        Butler.from_config("label")
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                # Now with bad contents.
                with open(temp_file.ospath, "w") as fh:
                    print("'", file=fh)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    with self.assertRaisesRegex(FileNotFoundError, "(no known aliases:.*could not be read)"):
                        Butler.from_config("label")
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())

            with self.assertRaisesRegex(FileNotFoundError, "index file not found"):
                Butler.from_config("label")

        # Check that we can create Butler when the alias file is not found.
        butler = Butler.from_config(self.tmpConfigFile, writeable=False)
        self.assertIsInstance(butler, Butler)
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertEqual(Butler.get_repo_uri("label", True), ResourcePath("label", forceAbsolute=False))
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases.*No repository index"):
            # No aliases registered.
            Butler.from_config("not_there")
        self.assertEqual(Butler.get_known_repos(), set())

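    # For orientation, an index file pointed at by DAF_BUTLER_REPOSITORY_INDEX
    # is a plain label-to-URI mapping; a hypothetical example matching the
    # labels exercised above:
    #
    #     label: /path/to/repo/butler.yaml
    #     bad_label: file://bucket/not_real.yaml
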

    def testBasicPutGet(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler._datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler._datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testStorageClassOverrideGet(self) -> None:
        """Test storage class conversion on get with override."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        datasetTypeName = "anything"
        run = self.default_run

        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset.
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        ref = butler.put(metric, datasetType, dataId)

        # Return native type.
        retrieved = butler.get(ref)
        self.assertEqual(retrieved, metric)

        # Specify an override.
        new_sc = self.storageClassFactory.getStorageClass("MetricsConversion")
        model = butler.get(ref, storageClass=new_sc)
        self.assertNotEqual(type(model), type(retrieved))
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override later.
        deferred = butler.getDeferred(ref)
        model = deferred.get(storageClass=new_sc)
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override up front.
        deferred = butler.getDeferred(ref, storageClass=new_sc)
        model = deferred.get()
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Retrieve a component. Should be a tuple.
        data = butler.get("anything.data", dataId, storageClass="StructuredDataDataTestTuple")
        self.assertIs(type(data), tuple)
        self.assertEqual(data, tuple(retrieved.data))

        # Parameter on the write storage class should work regardless
        # of read storage class.
        data = butler.get(
            "anything.data",
            dataId,
            storageClass="StructuredDataDataTestTuple",
            parameters={"slice": slice(2, 4)},
        )
        self.assertEqual(len(data), 2)

        # Try a parameter that is known to the read storage class but not
        # the write storage class.
        with self.assertRaises(KeyError):
            butler.get(
                "anything.data",
                dataId,
                storageClass="StructuredDataDataTestTuple",
                parameters={"xslice": slice(2, 4)},
            )

    def testPytypePutCoercion(self) -> None:
        """Test python type coercion on Butler.get and put."""
        # Store some data with the normal example storage class.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        datasetTypeName = "test_metric"
        butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)

        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Put a dict and this should coerce to a MetricsExample
        test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
        metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
        test_metric = butler.get(metric_ref)
        self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
        self.assertEqual(test_metric.summary, test_dict["summary"])
        self.assertEqual(test_metric.output, test_dict["output"])

        # Check that the put still works if a DatasetType is given with
        # a definition matching this python type.
        registry_type = butler.registry.getDatasetType(datasetTypeName)
        this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
        metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
        self.assertEqual(metric2_ref.datasetType, registry_type)

        # The get will return the type expected by registry.
        test_metric2 = butler.get(metric2_ref)
        self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")

        # Make a new DatasetRef with the compatible but different DatasetType.
        # This should now return a dict.
        new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
        test_dict2 = butler.get(new_ref)
        self.assertEqual(get_full_type_name(test_dict2), "dict")


        # Get it again with the wrong dataset type definition, this time
        # passing the DatasetType rather than a DatasetRef. This should be
        # consistent with the DatasetRef behavior above and return the
        # python type associated with the given DatasetType.
        test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
        self.assertEqual(get_full_type_name(test_dict3), "dict")


    def testIngest(self) -> None:
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)

        # Create and register a DatasetType
        dimensions = butler.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImportType("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, run=self.default_run)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, run=self.default_run))

        # Test "move" transfer to ensure that the files themselves
        # have disappeared following ingest.
        with ResourcePath.temporary_uri(suffix=".yaml") as tempFile:
            tempFile.transfer_from(ResourcePath(metricFile), transfer="copy")

            datasets = []
            datasets.append(FileDataset(path=tempFile, refs=refs, formatter=MultiDetectorFormatter))

            # For first ingest use copy.
            butler.ingest(*datasets, transfer="copy", record_validation_info=False)

            # Now try to ingest again in "execution butler" mode where
            # the registry entries exist but the datastore does not have
            # the files. We also need to strip the dimension records to ensure
            # that they will be re-added by the ingest.
            ref = datasets[0].refs[0]
            datasets[0].refs = [
                cast(
                    DatasetRef,
                    butler.registry.findDataset(ref.datasetType, dataId=ref.dataId, collections=ref.run),
                )
                for ref in datasets[0].refs
            ]
            all_refs = []
            for dataset in datasets:
                refs = []
                for ref in dataset.refs:
                    # Create a dict from the dataId to drop the records.
                    new_data_id = {str(k): v for k, v in ref.dataId.items()}
                    new_ref = butler.registry.findDataset(ref.datasetType, new_data_id, collections=ref.run)
                    assert new_ref is not None
                    self.assertFalse(new_ref.dataId.hasRecords())
                    refs.append(new_ref)
                dataset.refs = refs
                all_refs.extend(dataset.refs)
            butler.pruneDatasets(all_refs, disassociate=False, unstore=True, purge=False)

            # Use move mode to test that the file is deleted. Also
            # disable recording of file size.
            butler.ingest(*datasets, transfer="move", record_validation_info=False)

            # Check that every ref now has records.
            for dataset in datasets:
                for ref in dataset.refs:
                    self.assertTrue(ref.dataId.hasRecords())

            # Ensure that the file has disappeared.
            self.assertFalse(tempFile.exists())

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler._datastore.getStoredItemsInfo(datasets[0].refs[0])  # type: ignore[attr-defined]
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.exists(datasetTypeName, dataId1))
        self.assertTrue(butler.exists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

        # Ensure we can ingest 0 datasets.
        datasets = []
        butler.ingest(*datasets)

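    # A sketch of the single-file, multi-dataset ingest shape used above;
    # the path and refs are placeholders:
    #
    #     FileDataset(
    #         path="detectors.yaml",
    #         refs=[ref_detector1, ref_detector2],
    #         formatter=MultiDetectorFormatter,
    #     )
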

    def testPickle(self) -> None:
        """Test pickle support."""
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        assert isinstance(butler, DirectButler), "Expect DirectButler in configuration"
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self) -> None:
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        dimensions = butler.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries: list[tuple[str, list[Mapping[str, Any]]]] = [
            (
                "instrument",
                [
                    {"instrument": "DummyCam"},
                    {"instrument": "DummyHSC"},
                    {"instrument": "DummyCamComp"},
                ],
            ),
            ("physical_filter", [{"instrument": "DummyCam", "name": "d-r", "band": "R"}]),
            ("visit", [{"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}]),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for element, data in dimensionEntries:
            butler.registry.insertDimensionData(element, *data)

        # When a DatasetType is added to the registry entries are not created
        # for components but querying them can return the components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry: set[DatasetType] = set()
        for parent_dataset_type in butler.registry.queryDatasetTypes():
            fromRegistry.add(parent_dataset_type)
            fromRegistry.update(parent_dataset_type.makeAllComponentDatasetTypes())
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self) -> None:
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        datasetTypeName = "test_metric"
        dimensions = butler.dimensions.extract(["instrument", "visit"])
        dimensionEntries: tuple[tuple[str, Mapping[str, Any]], ...] = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test getDirect
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.get(ref)

    def testMakeRepo(self) -> None:
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler.from_config(butlerConfig)
        assert isinstance(butler1, DirectButler), "Expect DirectButler in configuration"
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler.from_config(butlerConfig)
        assert isinstance(butler2, DirectButler), "Expect DirectButler in configuration"
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler.from_config(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self) -> None:
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler._datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self) -> None:
        """Test that dataIds can be rewritten based on dimension records."""
        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)



class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root: str | ResourcePath, relpath: str | ResourcePath) -> bool:
        """Check if file exists at a given path (relative to root).

        Test testPutTemplates verifies actual physical existence of the files
        in the requested location.

1246 """ 

1247 uri = ResourcePath(root, forceDirectory=True) 

1248 return uri.join(relpath).exists() 

1249 

1250 def testPutTemplates(self) -> None: 

1251 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1252 butler = Butler.from_config(self.tmpConfigFile, run=self.default_run) 

1253 

1254 # Add needed Dimensions 

1255 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

1256 butler.registry.insertDimensionData( 

1257 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

1258 ) 

1259 butler.registry.insertDimensionData( 

1260 "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"} 

1261 ) 

1262 butler.registry.insertDimensionData( 

1263 "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"} 

1264 ) 

1265 

1266 # Create and store a dataset 

1267 metric = makeExampleMetrics() 

1268 

        # Create three almost-identical DatasetTypes; the first two will use
        # the default file template, while metric3's template is exercised
        # separately below.

        dimensions = butler.dimensions.extract(["instrument", "visit"])
        butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass))
        butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass))

        dataId1 = {"instrument": "DummyCamComp", "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"}

        # Put with exactly the data ID keys needed
        ref = butler.put(metric, "metric1", dataId1)
        uri = butler.getURI(ref)
        self.assertTrue(uri.exists())
        self.assertTrue(
            uri.unquoted_path.endswith(f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle")
        )

        # Check the template based on dimensions
        if hasattr(butler._datastore, "templates"):
            butler._datastore.templates.validateTemplates([ref])

        # Put with extra data ID keys (physical_filter is an optional
        # dependency); should not change template (at least the way we're
        # defining them to behave now; the important thing is that they
        # must be consistent).
        ref = butler.put(metric, "metric2", dataId2)
        uri = butler.getURI(ref)
        self.assertTrue(uri.exists())
        self.assertTrue(
            uri.unquoted_path.endswith(f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle")
        )

        # Check the template based on dimensions
        if hasattr(butler._datastore, "templates"):
            butler._datastore.templates.validateTemplates([ref])

        # Use a template that has a typo in dimension record metadata.
        # Easier to test with a butler that has a ref with records attached.
        template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits")
        with self.assertLogs("lsst.daf.butler.datastore.file_templates", "INFO"):
            path = template.format(ref)
        self.assertEqual(path, f"a/v423/{ref.id}_fits")

        template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits")
        with self.assertRaises(KeyError):
            with self.assertLogs("lsst.daf.butler.datastore.file_templates", "INFO"):
                template.format(ref)

        # Now use a file template that will not result in unique filenames.
        with self.assertRaises(FileTemplateValidationError):
            butler.put(metric, "metric3", dataId1)

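    # A sketch of the FileTemplate placeholder syntax exercised above:
    # "{id}" is the dataset ID, "{visit.name}" reaches into dimension-record
    # metadata, and a trailing ":?" marks the field optional so a missing
    # value is dropped rather than raising. The "{run}" and "{datasetType}"
    # fields below are assumptions based on common template usage, not taken
    # from this test:
    #
    #     FileTemplate("{run}/{datasetType}/{visit.name:?}_{id}.fits")
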

1322 def testImportExport(self) -> None: 

1323 # Run put/get tests just to create and populate a repo. 

1324 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1325 self.runImportExportTest(storageClass) 

1326 

1327 @unittest.expectedFailure 

1328 def testImportExportVirtualComposite(self) -> None: 

1329 # Run put/get tests just to create and populate a repo. 

1330 storageClass = self.storageClassFactory.getStorageClass("StructuredComposite") 

1331 self.runImportExportTest(storageClass) 

1332 

1333 def runImportExportTest(self, storageClass: StorageClass) -> None: 

1334 """Test exporting and importing. 

1335 

1336 This test does an export to a temp directory and an import back 

1337 into a new temp directory repo. It does not assume a posix datastore. 

1338 """ 

1339 exportButler = self.runPutGetTest(storageClass, "test_metric") 

1340 

1341 # Test that we must have a file extension. 

1342 with self.assertRaises(ValueError): 

1343 with exportButler.export(filename="dump", directory=".") as export: 

1344 pass 

1345 

1346 # Test that unknown format is not allowed. 

1347 with self.assertRaises(ValueError): 

1348 with exportButler.export(filename="dump.fits", directory=".") as export: 

1349 pass 

1350 

1351 # Test that the repo actually has at least one dataset. 

1352 datasets = list(exportButler.registry.queryDatasets(..., collections=...)) 

1353 self.assertGreater(len(datasets), 0) 

1354 # Add a DimensionRecord that's unused by those datasets. 

1355 skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")} 

1356 exportButler.registry.insertDimensionData("skymap", skymapRecord) 

1357 # Export and then import datasets. 

1358 with safeTestTempDir(TESTDIR) as exportDir: 

1359 exportFile = os.path.join(exportDir, "exports.yaml") 

1360 with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export: 

1361 export.saveDatasets(datasets) 

1362 # Export the same datasets again. This should quietly do 

1363 # nothing because of internal deduplication, and it shouldn't 

1364 # complain about being asked to export the "htm7" elements even 

1365 # though there aren't any in these datasets or in the database. 

1366 export.saveDatasets(datasets, elements=["htm7"]) 

1367 # Save one of the data IDs again; this should be harmless 

1368 # because of internal deduplication. 

1369 export.saveDataIds([datasets[0].dataId]) 

1370 # Save some dimension records directly. 

1371 export.saveDimensionData("skymap", [skymapRecord]) 

1372 self.assertTrue(os.path.exists(exportFile)) 

1373 with safeTestTempDir(TESTDIR) as importDir: 

1374 # We always want this to be a local posix butler 

1375 Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml"))) 

1376 # Calling script.butlerImport tests the implementation of the 

1377 # butler command line interface "import" subcommand. Functions 

1378 # in the script folder are generally considered protected and 

1379 # should not be used as public API. 

1380 with open(exportFile) as f: 

1381 script.butlerImport( 

1382 importDir, 

1383 export_file=f, 

1384 directory=exportDir, 

1385 transfer="auto", 

1386 skip_dimensions=None, 

1387 ) 

1388 importButler = Butler.from_config(importDir, run=self.default_run) 

1389 for ref in datasets: 

1390 with self.subTest(ref=ref): 

1391 # Test for existence by passing in the DatasetType and 

1392 # data ID separately, to avoid lookup by dataset_id. 

1393 self.assertTrue(importButler.exists(ref.datasetType, ref.dataId)) 

1394 self.assertEqual( 

1395 list(importButler.registry.queryDimensionRecords("skymap")), 

1396 [importButler.dimensions["skymap"].RecordClass(**skymapRecord)], 

1397 ) 

1398 

1399 def testRemoveRuns(self) -> None: 

1400 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1401 butler = Butler.from_config(self.tmpConfigFile, writeable=True) 

1402 # Load registry data with dimensions to hang datasets off of. 

1403 registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry")) 

1404 butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) 

1405 # Add some RUN-type collections. 

1406 run1 = "run1" 

1407 butler.registry.registerRun(run1) 

1408 run2 = "run2" 

1409 butler.registry.registerRun(run2) 

1410 # Put a dataset in each. 

1411 metric = makeExampleMetrics() 

1412 dimensions = butler.dimensions.extract(["instrument", "physical_filter"]) 

1413 datasetType = self.addDatasetType( 

1414 "prune_collections_test_dataset", dimensions, storageClass, butler.registry 

1415 ) 

1416 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1) 

1417 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2) 

1418 uri1 = butler.getURI(ref1) 

1419 uri2 = butler.getURI(ref2) 

1420 

1421 with self.assertRaises(OrphanedRecordError): 

1422 butler.registry.removeDatasetType(datasetType.name) 

1423 

1424 # Remove from both runs with different values for unstore. 

1425 butler.removeRuns([run1], unstore=True) 

1426 butler.removeRuns([run2], unstore=False) 

1427 # Should be nothing in registry for either one, and datastore should 

1428 # not think either exists. 

1429 with self.assertRaises(MissingCollectionError): 

1430 butler.registry.getCollectionType(run1) 

1431 with self.assertRaises(MissingCollectionError): 

1432 butler.registry.getCollectionType(run2) 

1433 self.assertFalse(butler.stored(ref1)) 

1434 self.assertFalse(butler.stored(ref2)) 

1435 # The ref we unstored should be gone according to the URI, but the 

1436 # one we forgot should still be around. 

1437 self.assertFalse(uri1.exists()) 

1438 self.assertTrue(uri2.exists()) 

1439 

1440 # Now that the collections have been pruned we can remove the 

1441 # dataset type 

1442 butler.registry.removeDatasetType(datasetType.name) 

1443 
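# removeDatasetType also accepts a tuple of names, apparently including
# shell-style glob patterns; names that match no registered dataset type
# are logged as "not defined" rather than raising, as checked below.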

1444 with self.assertLogs("lsst.daf.butler.registry", "INFO") as cm: 

1445 butler.registry.removeDatasetType(("test*", "test*")) 

1446 self.assertIn("not defined", "\n".join(cm.output)) 

1447 

1448 

1449class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1450 """PosixDatastore specialization of a butler""" 

1451 

1452 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1453 fullConfigKey: str | None = ".datastore.formatters" 

1454 validationCanFail = True 

1455 datastoreStr = ["/tmp"] 

1456 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"] 

1457 registryStr = "/gen3.sqlite3" 
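# These class attributes parameterize the shared base-class tests:
# datastoreStr/datastoreName/registryStr are substrings expected in the
# butler's string representation (see the attribute docstrings in the S3
# test case below), and fullConfigKey names a config entry the makeRepo
# test expects to find in the fully expanded config.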

1458 

1459 def testPathConstructor(self) -> None: 

1460 """Independent test of constructor using PathLike.""" 

1461 butler = Butler.from_config(self.tmpConfigFile, run=self.default_run) 

1462 self.assertIsInstance(butler, Butler) 

1463 

1464 # And again with a Path object with the butler yaml 

1465 path = pathlib.Path(self.tmpConfigFile) 

1466 butler = Butler.from_config(path, writeable=False) 

1467 self.assertIsInstance(butler, Butler) 

1468 

1469 # And again with a Path object without the butler yaml 

1470 # (making sure we skip it if the tmp config doesn't end 

1471 # in butler.yaml -- which is the case for a subclass) 

1472 if self.tmpConfigFile.endswith("butler.yaml"): 

1473 path = pathlib.Path(os.path.dirname(self.tmpConfigFile)) 

1474 butler = Butler.from_config(path, writeable=False) 

1475 self.assertIsInstance(butler, Butler) 

1476 

1477 def testExportTransferCopy(self) -> None: 

1478 """Test local export using all transfer modes""" 

1479 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1480 exportButler = self.runPutGetTest(storageClass, "test_metric") 

1481 # Test that the repo actually has at least one dataset. 

1482 datasets = list(exportButler.registry.queryDatasets(..., collections=...)) 

1483 self.assertGreater(len(datasets), 0) 

1484 uris = [exportButler.getURI(d) for d in datasets] 

1485 assert isinstance(exportButler._datastore, FileDatastore) 

1486 datastoreRoot = exportButler.get_datastore_roots()[exportButler.get_datastore_names()[0]] 

1487 

1488 pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris] 

1489 

1490 for path in pathsInStore: 

1491 # Assume local file system 

1492 assert path is not None 

1493 self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}") 

1494 

1495 for transfer in ("copy", "link", "symlink", "relsymlink"): 

1496 with safeTestTempDir(TESTDIR) as exportDir: 

1497 with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export: 

1498 export.saveDatasets(datasets) 

1499 for path in pathsInStore: 

1500 assert path is not None 

1501 self.assertTrue( 

1502 self.checkFileExists(exportDir, path), 

1503 f"Check that mode {transfer} exported files", 

1504 ) 

1505 

1506 def testPruneDatasets(self) -> None: 

1507 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1508 butler = Butler.from_config(self.tmpConfigFile, writeable=True) 

1509 assert isinstance(butler._datastore, FileDatastore) 

1510 # Load registry data with dimensions to hang datasets off of. 

1511 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry")) 

1512 butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) 

1513 # Add some RUN-type collections. 

1514 run1 = "run1" 

1515 butler.registry.registerRun(run1) 

1516 run2 = "run2" 

1517 butler.registry.registerRun(run2) 

1518 # Put some datasets. ref1 and ref2 have the same data ID, and are in 

1519 # different runs. ref3 has a different data ID. 

1520 metric = makeExampleMetrics() 

1521 dimensions = butler.dimensions.extract(["instrument", "physical_filter"]) 

1522 datasetType = self.addDatasetType( 

1523 "prune_collections_test_dataset", dimensions, storageClass, butler.registry 

1524 ) 

1525 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1) 

1526 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2) 

1527 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1) 

1528 

1529 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1530 for ref, stored in many_stored.items(): 

1531 self.assertTrue(stored, f"Ref {ref} should be stored") 

1532 

1533 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1534 for ref, exists in many_exists.items(): 

1535 self.assertTrue(exists, f"Checking ref {ref} exists.") 

1536 self.assertEqual(exists, DatasetExistence.VERIFIED, f"Ref {ref} should be stored") 

1537 

1538 # Simple prune. 

1539 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1540 self.assertFalse(butler.exists(ref1.datasetType, ref1.dataId, collections=run1)) 

1541 

1542 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1543 for ref, stored in many_stored.items(): 

1544 self.assertFalse(stored, f"Ref {ref} should not be stored") 

1545 

1546 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1547 for ref, exists in many_exists.items(): 

1548 self.assertEqual(exists, DatasetExistence.UNRECOGNIZED, f"Ref {ref} should not be stored") 

1549 

1550 # Put data back. 

1551 ref1_new = butler.put(metric, ref1) 

1552 self.assertEqual(ref1_new, ref1) # Reuses original ID. 

1553 ref2 = butler.put(metric, ref2) 

1554 

1555 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1556 self.assertTrue(many_stored[ref1]) 

1557 self.assertTrue(many_stored[ref2]) 

1558 self.assertFalse(many_stored[ref3]) 

1559 

1560 ref3 = butler.put(metric, ref3) 

1561 

1562 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1563 for ref, exists in many_exists.items(): 

1564 self.assertTrue(exists, f"Ref {ref} should be stored") 

1565 

1566 # Clear out the datasets from registry and start again. 

1567 refs = [ref1, ref2, ref3] 

1568 butler.pruneDatasets(refs, purge=True, unstore=True) 

1569 for ref in refs: 

1570 butler.put(metric, ref) 

1571 

1572 # Confirm we can retrieve deferred. 

1573 dref1 = butler.getDeferred(ref1) # known and exists 

1574 metric1 = dref1.get() 

1575 self.assertEqual(metric1, metric) 

1576 

1577 # Test different forms of file availability. 

1578 # Need to be in a state where: 

1579 # - one ref just has registry record. 

1580 # - one ref has a missing file but a datastore record. 

1581 # - one ref has a missing datastore record but file is there. 

1582 # - one ref does not exist anywhere. 

1583 # Do not need to test a ref that has everything since that is tested 

1584 # above. 

1585 ref0 = DatasetRef( 

1586 datasetType, 

1587 DataCoordinate.standardize( 

1588 {"instrument": "Cam1", "physical_filter": "Cam1-G"}, universe=butler.dimensions 

1589 ), 

1590 run=run1, 

1591 ) 

1592 

1593 # Delete from datastore and retain in Registry. 

1594 butler.pruneDatasets([ref1], purge=False, unstore=True, disassociate=False) 

1595 

1596 # File has been removed. 

1597 uri2 = butler.getURI(ref2) 

1598 uri2.remove() 

1599 

1600 # Datastore has lost track. 

1601 butler._datastore.forget([ref3]) 

1602 

1603 # First test with a standard butler. 

1604 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True) 

1605 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1606 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED) 

1607 self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE) 

1608 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED) 

1609 

1610 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=False) 

1611 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1612 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED | DatasetExistence._ASSUMED) 

1613 self.assertEqual(exists_many[ref2], DatasetExistence.KNOWN) 

1614 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ASSUMED) 

1615 self.assertTrue(exists_many[ref2]) 

1616 
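# Reading the assertions above together: DatasetExistence is a flag
# enum where KNOWN appears to equal RECORDED | DATASTORE | _ASSUMED
# (the cheap full_check=False answer), while VERIFIED replaces _ASSUMED
# with an actual artifact check, i.e. RECORDED | DATASTORE | _ARTIFACT.
# This is an inference from this test, not the enum's documentation.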

1617 # Check that per-ref query gives the same answer as many query. 

1618 for ref, exists in exists_many.items(): 

1619 self.assertEqual(butler.exists(ref, full_check=False), exists) 

1620 

1621 # getDeferred checks for existence before it allows the dataset to 

1622 # be retrieved. 

1623 with self.assertRaises(LookupError): 

1624 butler.getDeferred(ref3) # not known, file exists 

1625 dref2 = butler.getDeferred(ref2) # known but file missing 

1626 with self.assertRaises(FileNotFoundError): 

1627 dref2.get() 

1628 

1629 # Test again with a trusting butler. 

1630 butler._datastore.trustGetRequest = True 

1631 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True) 

1632 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1633 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED) 

1634 self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE) 

1635 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ARTIFACT) 

1636 

1637 # When trusting we can get a deferred dataset handle for a ref that 

1638 # the datastore does not know about but whose artifact does exist. 

1639 dref3 = butler.getDeferred(ref3) 

1640 metric3 = dref3.get() 

1641 self.assertEqual(metric3, metric) 

1642 

1643 # Check that per-ref query gives the same answer as many query. 

1644 for ref, exists in exists_many.items(): 

1645 self.assertEqual(butler.exists(ref, full_check=True), exists) 

1646 

1647 # Create a ref that reuses the UUID of an existing ref but is not 

1648 # the same ref. 

1649 ref_bad = DatasetRef(datasetType, dataId=ref3.dataId, run=ref3.run, id=ref2.id) 

1650 with self.assertRaises(ValueError): 

1651 butler.exists(ref_bad) 

1652 

1653 # Create a ref that has a compatible storage class. 

1654 ref_compat = ref2.overrideStorageClass("StructuredDataDict") 

1655 exists = butler.exists(ref_compat) 

1656 self.assertEqual(exists, exists_many[ref2]) 

1657 

1658 # Remove everything and start from scratch. 

1659 butler._datastore.trustGetRequest = False 

1660 butler.pruneDatasets(refs, purge=True, unstore=True) 

1661 for ref in refs: 

1662 butler.put(metric, ref) 

1663 

1664 # These tests mess directly with the trash table and can leave the 

1665 # datastore in an odd state. Do them at the end. 

1666 # Check that in normal mode, deleting the record will lead to 

1667 # trash not touching the file. 

1668 uri1 = butler.getURI(ref1) 

1669 butler._datastore.bridge.moveToTrash([ref1], transaction=None) # Update the dataset_location table 

1670 butler._datastore.forget([ref1]) 

1671 butler._datastore.trash(ref1) 

1672 butler._datastore.emptyTrash() 

1673 self.assertTrue(uri1.exists()) 

1674 uri1.remove() # Clean it up. 

1675 

1676 # Simulate execution butler setup by deleting the datastore 

1677 # record but keeping the file around and trusting. 

1678 butler._datastore.trustGetRequest = True 

1679 uris = butler.get_many_uris([ref2, ref3]) 

1680 uri2 = uris[ref2].primaryURI 

1681 uri3 = uris[ref3].primaryURI 

1682 self.assertTrue(uri2.exists()) 

1683 self.assertTrue(uri3.exists()) 

1684 

1685 # Remove the datastore record. 

1686 butler._datastore.bridge.moveToTrash([ref2], transaction=None) # Update the dataset_location table 

1687 butler._datastore.forget([ref2]) 

1688 self.assertTrue(uri2.exists()) 

1689 butler._datastore.trash([ref2, ref3]) 

1690 # Immediate removal for ref2 file 

1691 self.assertFalse(uri2.exists()) 

1692 # But ref3 has to wait for the empty. 

1693 self.assertTrue(uri3.exists()) 

1694 butler._datastore.emptyTrash() 

1695 self.assertFalse(uri3.exists()) 

1696 

1697 # Clear out the datasets from registry. 

1698 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1699 

1700 def testPytypeCoercion(self) -> None: 

1701 """Test python type coercion on Butler.get and put.""" 

1702 # Store some data with the normal example storage class. 

1703 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1704 datasetTypeName = "test_metric" 

1705 butler = self.runPutGetTest(storageClass, datasetTypeName) 

1706 

1707 dataId = {"instrument": "DummyCamComp", "visit": 423} 

1708 metric = butler.get(datasetTypeName, dataId=dataId) 

1709 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample") 

1710 

1711 datasetType_ori = butler.registry.getDatasetType(datasetTypeName) 

1712 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents") 

1713 

1714 # Now need to hack the registry dataset type definition. 

1715 # There is no API for this. 

1716 assert isinstance(butler._registry, SqlRegistry) 

1717 manager = butler._registry._managers.datasets 

1718 assert hasattr(manager, "_db") and hasattr(manager, "_static") 

1719 manager._db.update( 

1720 manager._static.dataset_type, 

1721 {"name": datasetTypeName}, 

1722 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"}, 

1723 ) 
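# The update call above appears to rely on Database.update's convention
# that the "where" dict maps column names to the keys in the row dicts
# that hold the comparison values; using the dataset type name itself as
# that key is just a convenient placeholder. This is a private API with
# no public equivalent, per the comment above.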

1724 

1725 # Force reset of dataset type cache 

1726 butler.registry.refresh() 

1727 

1728 datasetType_new = butler.registry.getDatasetType(datasetTypeName) 

1729 self.assertEqual(datasetType_new.name, datasetType_ori.name) 

1730 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel") 

1731 

1732 metric_model = butler.get(datasetTypeName, dataId=dataId) 

1733 self.assertNotEqual(type(metric_model), type(metric)) 

1734 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel") 
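# The same stored file now comes back as a different Python type because
# the registry's storage class changed; presumably this works because
# the Model storage class declares a converter from MetricsExample in
# the storage class configuration.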

1735 

1736 # Put the model and read it back to show that everything now 

1737 # works as normal. 

1738 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424) 

1739 metric_model_new = butler.get(metric_ref) 

1740 self.assertEqual(metric_model_new, metric_model) 

1741 

1742 # Hack the storage class again to something that will make the get 

1743 # fail because no conversion is possible. 

1744 manager._db.update( 

1745 manager._static.dataset_type, 

1746 {"name": datasetTypeName}, 

1747 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"}, 

1748 ) 

1749 butler.registry.refresh() 

1750 

1751 with self.assertRaises(ValueError): 

1752 butler.get(datasetTypeName, dataId=dataId) 

1753 

1754 

1755@unittest.skipUnless(testing is not None, "testing.postgresql module not found") 

1756class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1757 """PosixDatastore specialization of a butler using Postgres""" 

1758 

1759 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1760 fullConfigKey = ".datastore.formatters" 

1761 validationCanFail = True 

1762 datastoreStr = ["/tmp"] 

1763 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"] 

1764 registryStr = "PostgreSQL@test" 

1765 postgresql: Any 

1766 

1767 @staticmethod 

1768 def _handler(postgresql: Any) -> None: 

1769 engine = sqlalchemy.engine.create_engine(postgresql.url()) 

1770 with engine.begin() as connection: 

1771 connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;")) 

1772 

1773 @classmethod 

1774 def setUpClass(cls) -> None: 

1775 # Create the postgres test server. 

1776 cls.postgresql = testing.postgresql.PostgresqlFactory( 

1777 cache_initialized_db=True, on_initialized=cls._handler 

1778 ) 

1779 super().setUpClass() 

1780 

1781 @classmethod 

1782 def tearDownClass(cls) -> None: 

1783 # Clean up any lingering SQLAlchemy engines/connections 

1784 # so they're closed before we shut down the server. 

1785 gc.collect() 

1786 cls.postgresql.clear_cache() 

1787 super().tearDownClass() 

1788 

1789 def setUp(self) -> None: 

1790 self.server = self.postgresql() 

1791 

1792 # Need to add a registry section to the config. 

1793 self._temp_config = False 

1794 config = Config(self.configFile) 

1795 config["registry", "db"] = self.server.url() 

1796 with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh: 

1797 config.dump(fh) 

1798 self.configFile = fh.name 

1799 self._temp_config = True 

1800 super().setUp() 
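# The temporary config written above differs from the packaged one only
# in its registry section, roughly (illustrative; the actual port is
# chosen by testing.postgresql):
#
#     registry:
#       db: postgresql://postgres@127.0.0.1:<port>/test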

1801 

1802 def tearDown(self) -> None: 

1803 self.server.stop() 

1804 if self._temp_config and os.path.exists(self.configFile): 

1805 os.remove(self.configFile) 

1806 super().tearDown() 

1807 

1808 def testMakeRepo(self) -> None: 

1809 # The base class test assumes that it's using sqlite and assumes 

1810 # the config file is acceptable to sqlite. 

1811 raise unittest.SkipTest("Postgres config is not compatible with this test.") 

1812 

1813 

1814class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1815 """InMemoryDatastore specialization of a butler""" 

1816 

1817 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1818 fullConfigKey = None 

1819 useTempRoot = False 

1820 validationCanFail = False 

1821 datastoreStr = ["datastore='InMemory"] 

1822 datastoreName = ["InMemoryDatastore@"] 

1823 registryStr = "/gen3.sqlite3" 

1824 

1825 def testIngest(self) -> None: 
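# The in-memory datastore has no file artifacts to ingest, so the
# base-class ingest test is disabled with this empty override
# (an inference from the datastore type, not a documented reason).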

1826 pass 

1827 

1828 

1829class ChainedDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1830 """PosixDatastore specialization""" 

1831 

1832 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

1833 fullConfigKey = ".datastore.datastores.1.formatters" 

1834 validationCanFail = True 

1835 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"] 

1836 datastoreName = [ 

1837 "InMemoryDatastore@", 

1838 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1", 

1839 "SecondDatastore", 

1840 ] 

1841 registryStr = "/gen3.sqlite3" 

1842 

1843 

1844class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase): 

1845 """Test that a yaml file in one location can refer to a root in another.""" 

1846 

1847 datastoreStr = ["dir1"] 

1848 # Disable the makeRepo test since we are deliberately not using 

1849 # butler.yaml as the config name. 

1850 fullConfigKey = None 

1851 

1852 def setUp(self) -> None: 

1853 self.root = makeTestTempDir(TESTDIR) 

1854 

1855 # Make a new repository in one place 

1856 self.dir1 = os.path.join(self.root, "dir1") 

1857 Butler.makeRepo(self.dir1, config=Config(self.configFile)) 

1858 

1859 # Move the yaml file to a different place and add a "root" 

1860 self.dir2 = os.path.join(self.root, "dir2") 

1861 os.makedirs(self.dir2, exist_ok=True) 

1862 configFile1 = os.path.join(self.dir1, "butler.yaml") 

1863 config = Config(configFile1) 

1864 config["root"] = self.dir1 

1865 configFile2 = os.path.join(self.dir2, "butler2.yaml") 

1866 config.dumpToUri(configFile2) 

1867 os.remove(configFile1) 

1868 self.tmpConfigFile = configFile2 

1869 

1870 def testFileLocations(self) -> None: 

1871 self.assertNotEqual(self.dir1, self.dir2) 

1872 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml"))) 

1873 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml"))) 

1874 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3"))) 

1875 

1876 

1877class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase): 

1878 """Test that a config file created by makeRepo outside of the repo works.""" 

1879 

1880 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1881 

1882 def setUp(self) -> None: 

1883 self.root = makeTestTempDir(TESTDIR) 

1884 self.root2 = makeTestTempDir(TESTDIR) 

1885 

1886 self.tmpConfigFile = os.path.join(self.root2, "different.yaml") 

1887 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1888 

1889 def tearDown(self) -> None: 

1890 if os.path.exists(self.root2): 

1891 shutil.rmtree(self.root2, ignore_errors=True) 

1892 super().tearDown() 

1893 

1894 def testConfigExistence(self) -> None: 

1895 c = Config(self.tmpConfigFile) 

1896 uri_config = ResourcePath(c["root"]) 

1897 uri_expected = ResourcePath(self.root, forceDirectory=True) 

1898 self.assertEqual(uri_config.geturl(), uri_expected.geturl()) 

1899 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path") 

1900 

1901 def testPutGet(self) -> None: 

1902 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1903 self.runPutGetTest(storageClass, "test_metric") 

1904 

1905 

1906class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase): 

1907 """Test that a config file created by makeRepo outside of the repo works.""" 

1908 

1909 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1910 

1911 def setUp(self) -> None: 

1912 self.root = makeTestTempDir(TESTDIR) 

1913 self.root2 = makeTestTempDir(TESTDIR) 

1914 

1915 self.tmpConfigFile = self.root2 

1916 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1917 

1918 def testConfigExistence(self) -> None: 

1919 # Append the yaml file name, else the Config constructor does not 

1920 # know the file type. 

1921 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml") 

1922 super().testConfigExistence() 

1923 

1924 

1925class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase): 

1926 """Test that a config file created by makeRepo outside of the repo works.""" 

1927 

1928 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1929 

1930 def setUp(self) -> None: 

1931 self.root = makeTestTempDir(TESTDIR) 

1932 self.root2 = makeTestTempDir(TESTDIR) 

1933 

1934 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl() 

1935 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1936 

1937 

1938@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!") 

1939class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1940 """S3Datastore specialization of a butler: an S3 storage Datastore + 

1941 a local SQLite-backed SqlRegistry. 

1942 """ 

1943 

1944 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml") 

1945 fullConfigKey = None 

1946 validationCanFail = True 

1947 

1948 bucketName = "anybucketname" 

1949 """Name of the Bucket that will be used in the tests. The name is read from 

1950 the config file used with the tests during set-up. 

1951 """ 

1952 

1953 root = "butlerRoot/" 

1954 """Root repository directory expected to be used when useTempRoot=False. 

1955 Otherwise the root is set to a randomly generated 20-character string 

1956 during set-up. 

1957 """ 

1958 

1959 datastoreStr = [f"datastore={root}"] 

1960 """Contains all expected root locations in a format expected to be 

1961 returned by Butler stringification. 

1962 """ 

1963 

1964 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"] 

1965 """The expected format of the S3 Datastore string.""" 

1966 

1967 registryStr = "/gen3.sqlite3" 

1968 """Expected format of the Registry string.""" 

1969 

1970 mock_s3 = mock_s3() 

1971 """The mocked s3 interface from moto.""" 

1972 

1973 def genRoot(self) -> str: 

1974 """Return a random 20-character string to serve as a root 

1975 name for the temporary bucket repo. 

1976 

1977 This is the equivalent of tempfile.mkdtemp, since that is what 

1978 self.root becomes when useTempRoot is True. 

1979 """ 

1980 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20)) 

1981 return rndstr + "/" 

1982 

1983 def setUp(self) -> None: 

1984 config = Config(self.configFile) 

1985 uri = ResourcePath(config[".datastore.datastore.root"]) 

1986 self.bucketName = uri.netloc 

1987 

1988 # Enable S3 mocking of tests. 

1989 self.mock_s3.start() 

1990 

1991 # set up some fake credentials if they do not exist 

1992 self.usingDummyCredentials = setAwsEnvCredentials() 

1993 

1994 if self.useTempRoot: 

1995 self.root = self.genRoot() 

1996 rooturi = f"s3://{self.bucketName}/{self.root}" 

1997 config.update({"datastore": {"datastore": {"root": rooturi}}}) 

1998 

1999 # need local folder to store registry database 

2000 self.reg_dir = makeTestTempDir(TESTDIR) 

2001 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

2002 

2003 # Moto needs to know that we expect the bucket to exist 

2004 # (the name used to be the class attribute bucketName). 

2005 s3 = boto3.resource("s3") 

2006 s3.create_bucket(Bucket=self.bucketName) 

2007 

2008 self.datastoreStr = [f"datastore='{rooturi}'"] 

2009 self.datastoreName = [f"FileDatastore@{rooturi}"] 

2010 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False) 

2011 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml") 

2012 

2013 def tearDown(self) -> None: 

2014 s3 = boto3.resource("s3") 

2015 bucket = s3.Bucket(self.bucketName) 

2016 try: 

2017 bucket.objects.all().delete() 

2018 except botocore.exceptions.ClientError as e: 

2019 if e.response["Error"]["Code"] == "404": 

2020 # the key was not reachable - pass 

2021 pass 

2022 else: 

2023 raise 

2024 

2025 bucket = s3.Bucket(self.bucketName) 

2026 bucket.delete() 

2027 

2028 # Stop the S3 mock. 

2029 self.mock_s3.stop() 

2030 

2031 # unset any potentially set dummy credentials 

2032 if self.usingDummyCredentials: 

2033 unsetAwsEnvCredentials() 

2034 

2035 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

2036 shutil.rmtree(self.reg_dir, ignore_errors=True) 

2037 

2038 if self.useTempRoot and os.path.exists(self.root): 

2039 shutil.rmtree(self.root, ignore_errors=True) 

2040 

2041 super().tearDown() 

2042 

2043 

2044class PosixDatastoreTransfers(unittest.TestCase): 

2045 """Test data transfers between butlers. 

2046 

2047 Tests cover different dataset-ID managers. UUID to UUID and integer to 

2048 integer transfers are tested. UUID to integer is not supported, since 

2049 we do not currently want to allow that. Integer to UUID is supported, 

2050 with the caveat that a UUID4 will be generated, which is incorrect for 

2051 raw dataset types; the tests ignore that. 

2052 """ 

2053 

2054 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

2055 storageClassFactory: StorageClassFactory 

2056 

2057 @classmethod 

2058 def setUpClass(cls) -> None: 

2059 cls.storageClassFactory = StorageClassFactory() 

2060 cls.storageClassFactory.addFromConfig(cls.configFile) 

2061 

2062 def setUp(self) -> None: 

2063 self.root = makeTestTempDir(TESTDIR) 

2064 self.config = Config(self.configFile) 

2065 

2066 def tearDown(self) -> None: 

2067 removeTestTempDir(self.root) 

2068 

2069 def create_butler(self, manager: str, label: str) -> Butler: 

2070 config = Config(self.configFile) 

2071 config["registry", "managers", "datasets"] = manager 

2072 return Butler.from_config( 

2073 Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True 

2074 ) 
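# Note: Config supports hierarchical keys, so the tuple form used above
# is equivalent to the dotted-string form used elsewhere in this file,
# e.g. config[".registry.managers.datasets"].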

2075 

2076 def create_butlers(self, manager1: str | None = None, manager2: str | None = None) -> None: 

2077 default = "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID" 

2078 if manager1 is None: 

2079 manager1 = default 

2080 if manager2 is None: 

2081 manager2 = default 

2082 self.source_butler = self.create_butler(manager1, "1") 

2083 self.target_butler = self.create_butler(manager2, "2") 

2084 

2085 def testTransferUuidToUuid(self) -> None: 

2086 self.create_butlers() 

2087 self.assertButlerTransfers() 

2088 

2089 def _enable_trust(self, datastore: Datastore) -> None: 

2090 datastores = getattr(datastore, "datastores", [datastore]) 

2091 for this_datastore in datastores: 

2092 if hasattr(this_datastore, "trustGetRequest"): 

2093 this_datastore.trustGetRequest = True 

2094 

2095 def testTransferMissing(self) -> None: 

2096 """Test transfers where datastore records are missing. 

2097 

2098 This is how execution butler works. 

2099 """ 

2100 self.create_butlers() 

2101 

2102 # Configure the source butler to allow trust. 

2103 self._enable_trust(self.source_butler._datastore) 

2104 

2105 self.assertButlerTransfers(purge=True) 

2106 

2107 def testTransferMissingDisassembly(self) -> None: 

2108 """Test transfers where datastore records are missing. 

2109 

2110 This is how execution butler works. 

2111 """ 

2112 self.create_butlers() 

2113 

2114 # Configure the source butler to allow trust. 

2115 self._enable_trust(self.source_butler._datastore) 

2116 

2117 # Test disassembly. 

2118 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite") 

2119 

2120 def testAbsoluteURITransferDirect(self) -> None: 

2121 """Test transfer using an absolute URI.""" 

2122 self._absolute_transfer("auto") 

2123 

2124 def testAbsoluteURITransferCopy(self) -> None: 

2125 """Test transfer using an absolute URI.""" 

2126 self._absolute_transfer("copy") 

2127 

2128 def _absolute_transfer(self, transfer: str) -> None: 

2129 self.create_butlers() 

2130 

2131 storageClassName = "StructuredData" 

2132 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

2133 datasetTypeName = "random_data" 

2134 run = "run1" 

2135 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

2136 

2137 dimensions = self.source_butler.dimensions.extract(()) 

2138 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

2139 self.source_butler.registry.registerDatasetType(datasetType) 

2140 

2141 metrics = makeExampleMetrics() 

2142 with ResourcePath.temporary_uri(suffix=".json") as temp: 

2143 dataId = DataCoordinate.makeEmpty(self.source_butler.dimensions) 

2144 source_refs = [DatasetRef(datasetType, dataId, run=run)] 

2145 temp.write(json.dumps(metrics.exportAsDict()).encode()) 

2146 dataset = FileDataset(path=temp, refs=source_refs) 

2147 self.source_butler.ingest(dataset, transfer="direct") 
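# transfer="direct" ingests by recording the file's absolute URI without
# copying it into the datastore root, which is why the "auto" transfer
# checked below can resolve to the original temporary location.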

2148 

2149 self.target_butler.transfer_from( 

2150 self.source_butler, dataset.refs, register_dataset_types=True, transfer=transfer 

2151 ) 

2152 

2153 uri = self.target_butler.getURI(dataset.refs[0]) 

2154 if transfer == "auto": 

2155 self.assertEqual(uri, temp) 

2156 else: 

2157 self.assertNotEqual(uri, temp) 

2158 

2159 def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "StructuredData") -> None: 

2160 """Test that a run can be transferred to another butler.""" 

2161 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

2162 datasetTypeName = "random_data" 

2163 

2164 # The test will create three collections, and we will want to transfer 

2165 # two of those three. 

2166 runs = ["run1", "run2", "other"] 

2167 

2168 # Also want to use two different dataset types to ensure that 

2169 # grouping works. 

2170 datasetTypeNames = ["random_data", "random_data_2"] 

2171 

2172 # Create the run collections in the source butler. 

2173 for run in runs: 

2174 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

2175 

2176 # Create dimensions in source butler. 

2177 n_exposures = 30 

2178 self.source_butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

2179 self.source_butler.registry.insertDimensionData( 

2180 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

2181 ) 

2182 self.source_butler.registry.insertDimensionData( 

2183 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"} 

2184 ) 

2185 

2186 for i in range(n_exposures): 

2187 self.source_butler.registry.insertDimensionData( 

2188 "exposure", 

2189 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"}, 

2190 ) 

2191 

2192 # Create dataset types in the source butler. 

2193 dimensions = self.source_butler.dimensions.extract(["instrument", "exposure"]) 

2194 for datasetTypeName in datasetTypeNames: 

2195 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

2196 self.source_butler.registry.registerDatasetType(datasetType) 

2197 

2198 # Write a dataset to an unrelated run -- this will ensure that 

2199 # we are rewriting integer dataset ids in the target if necessary. 

2200 # Will not be relevant for UUID. 

2201 run = "distraction" 

2202 butler = Butler.from_config(butler=self.source_butler, run=run) 

2203 butler.put( 

2204 makeExampleMetrics(), 

2205 datasetTypeName, 

2206 exposure=1, 

2207 instrument="DummyCamComp", 

2208 physical_filter="d-r", 

2209 ) 

2210 

2211 # Write some example metrics to the source 

2212 butler = Butler.from_config(butler=self.source_butler) 

2213 

2214 # Set of DatasetRefs that should be in the list of refs to transfer 

2215 # but which will not be transferred. 

2216 deleted: set[DatasetRef] = set() 

2217 

2218 n_expected = 20 # Number of datasets expected to be transferred 

2219 source_refs = [] 

2220 for i in range(n_exposures): 

2221 # Put a third of the datasets into each collection; only retain 

2222 # two thirds. 
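# With 30 exposures spread over runs[0..2] via i % 3, each run receives
# 10 datasets; only indices 0 and 1 (run1 and run2) are retained, which
# matches n_expected = 20 before any purge-related deletions.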

2223 index = i % 3 

2224 run = runs[index] 

2225 datasetTypeName = datasetTypeNames[i % 2] 

2226 

2227 metric = MetricsExample( 

2228 summary={"counter": i}, output={"text": "metric"}, data=[2 * x for x in range(i)] 

2229 ) 

2230 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"} 

2231 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run) 

2232 

2233 # Remove the datastore record using low-level API, but only 

2234 # for a specific index. 

2235 if purge and index == 1: 

2236 # For one of these delete the file as well. 

2237 # This allows the "missing" code to filter the 

2238 # file out. 

2239 # Access the individual datastores. 

2240 datastores = [] 

2241 if hasattr(butler._datastore, "datastores"): 

2242 datastores.extend(butler._datastore.datastores) 

2243 else: 

2244 datastores.append(butler._datastore) 

2245 

2246 if not deleted: 

2247 # For a chained datastore we need to remove 

2248 # files in each chain. 

2249 for datastore in datastores: 

2250 # The file might not be known to the datastore 

2251 # if constraints are used. 

2252 try: 

2253 primary, uris = datastore.getURIs(ref) 

2254 except FileNotFoundError: 

2255 continue 

2256 if primary and primary.scheme != "mem": 

2257 primary.remove() 

2258 for uri in uris.values(): 

2259 if uri.scheme != "mem": 

2260 uri.remove() 

2261 n_expected -= 1 

2262 deleted.add(ref) 

2263 

2264 # Remove the datastore record. 

2265 for datastore in datastores: 

2266 if hasattr(datastore, "removeStoredItemInfo"): 

2267 datastore.removeStoredItemInfo(ref) 

2268 

2269 if index < 2: 

2270 source_refs.append(ref) 

2271 if ref not in deleted: 

2272 new_metric = butler.get(ref) 

2273 self.assertEqual(new_metric, metric) 

2274 

2275 # Create some bad dataset types to ensure we check for inconsistent 

2276 # definitions. 

2277 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList") 

2278 for datasetTypeName in datasetTypeNames: 

2279 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass) 

2280 self.target_butler.registry.registerDatasetType(datasetType) 

2281 with self.assertRaises(ConflictingDefinitionError) as cm: 

2282 self.target_butler.transfer_from(self.source_butler, source_refs) 

2283 self.assertIn("dataset type differs", str(cm.exception)) 

2284 

2285 # And remove the bad definitions. 

2286 for datasetTypeName in datasetTypeNames: 

2287 self.target_butler.registry.removeDatasetType(datasetTypeName) 

2288 

2289 # Transfer without creating dataset types should fail. 

2290 with self.assertRaises(KeyError): 

2291 self.target_butler.transfer_from(self.source_butler, source_refs) 

2292 

2293 # Transfer without creating dimensions should fail. 

2294 with self.assertRaises(ConflictingDefinitionError) as cm: 

2295 self.target_butler.transfer_from(self.source_butler, source_refs, register_dataset_types=True) 

2296 self.assertIn("dimension", str(cm.exception)) 

2297 

2298 # The failed transfer above leaves registry in an inconsistent 

2299 # state because the run is created but then rolled back without 

2300 # the collection cache being cleared. For now force a refresh. 

2301 # Can remove with DM-35498. 

2302 self.target_butler.registry.refresh() 

2303 

2304 # Now transfer them to the second butler, including dimensions. 

2305 with self.assertLogs(level=logging.DEBUG) as log_cm: 

2306 transferred = self.target_butler.transfer_from( 

2307 self.source_butler, 

2308 source_refs, 

2309 register_dataset_types=True, 

2310 transfer_dimensions=True, 

2311 ) 

2312 self.assertEqual(len(transferred), n_expected) 

2313 log_output = ";".join(log_cm.output) 

2314 

2315 # A ChainedDatastore will use the in-memory datastore for mexists 

2316 # so we can not rely on the mexists log message. 

2317 self.assertIn("Number of datastore records found in source", log_output) 

2318 self.assertIn("Creating output run", log_output) 

2319 

2320 # Do the transfer twice to ensure that it will do nothing extra. 

2321 # Only do this if purge=True because it does not work for int 

2322 # dataset_id. 

2323 if purge: 

2324 # This should not need to register dataset types. 

2325 transferred = self.target_butler.transfer_from(self.source_butler, source_refs) 

2326 self.assertEqual(len(transferred), n_expected) 

2327 

2328 # Also do an explicit low-level transfer to trigger some 

2329 # edge cases. 

2330 with self.assertLogs(level=logging.DEBUG) as log_cm: 

2331 self.target_butler._datastore.transfer_from(self.source_butler._datastore, source_refs) 

2332 log_output = ";".join(log_cm.output) 

2333 self.assertIn("no file artifacts exist", log_output) 

2334 

2335 with self.assertRaises((TypeError, AttributeError)): 

2336 self.target_butler._datastore.transfer_from(self.source_butler, source_refs) # type: ignore 

2337 

2338 with self.assertRaises(ValueError): 

2339 self.target_butler._datastore.transfer_from( 

2340 self.source_butler._datastore, source_refs, transfer="split" 

2341 ) 

2342 

2343 # Now try to get the same refs from the new butler. 

2344 for ref in source_refs: 

2345 if ref not in deleted: 

2346 new_metric = self.target_butler.get(ref) 

2347 old_metric = self.source_butler.get(ref) 

2348 self.assertEqual(new_metric, old_metric) 

2349 

2350 # Now prune the run2 collection and instead create a CHAINED collection. 

2351 # This should block the transfer. 

2352 self.target_butler.removeRuns(["run2"], unstore=True) 

2353 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED) 

2354 with self.assertRaises(CollectionTypeError): 

2355 # Re-importing the run1 datasets can be problematic if they 

2356 # use integer IDs so filter those out. 

2357 to_transfer = [ref for ref in source_refs if ref.run == "run2"] 

2358 self.target_butler.transfer_from(self.source_butler, to_transfer) 

2359 

2360 

2361class ChainedDatastoreTransfers(PosixDatastoreTransfers): 

2362 """Test transfers using a chained datastore.""" 

2363 

2364 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

2365 

2366 

2367class NullDatastoreTestCase(unittest.TestCase): 

2368 """Test that we can fall back to a null datastore.""" 

2369 

2370 # Need a good config to create the repo. 

2371 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

2372 storageClassFactory: StorageClassFactory 

2373 

2374 @classmethod 

2375 def setUpClass(cls) -> None: 

2376 cls.storageClassFactory = StorageClassFactory() 

2377 cls.storageClassFactory.addFromConfig(cls.configFile) 

2378 

2379 def setUp(self) -> None: 

2380 """Create a new butler root for each test.""" 

2381 self.root = makeTestTempDir(TESTDIR) 

2382 Butler.makeRepo(self.root, config=Config(self.configFile)) 

2383 

2384 def tearDown(self) -> None: 

2385 removeTestTempDir(self.root) 

2386 

2387 def test_fallback(self) -> None: 

2388 # Read the butler config and mess with the datastore section. 

2389 bad_config = Config(os.path.join(self.root, "butler.yaml")) 

2390 bad_config["datastore", "cls"] = "lsst.not.a.datastore.Datastore" 

2391 

2392 with self.assertRaises(RuntimeError): 

2393 Butler.from_config(bad_config) 

2394 

2395 butler = Butler.from_config(bad_config, writeable=True, without_datastore=True) 

2396 self.assertIsInstance(butler._datastore, NullDatastore) 

2397 

2398 # Check that registry is working. 

2399 butler.registry.registerRun("MYRUN") 

2400 collections = butler.registry.queryCollections(...) 

2401 self.assertIn("MYRUN", set(collections)) 

2402 

2403 # Create a ref. 

2404 dimensions = butler.dimensions.extract([]) 

2405 storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict") 

2406 datasetTypeName = "metric" 

2407 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

2408 butler.registry.registerDatasetType(datasetType) 

2409 ref = DatasetRef(datasetType, {}, run="MYRUN") 

2410 

2411 # Check that datastore will complain. 

2412 with self.assertRaises(FileNotFoundError): 

2413 butler.get(ref) 

2414 with self.assertRaises(FileNotFoundError): 

2415 butler.getURI(ref) 

2416 

2417 

2418def setup_module(module: types.ModuleType) -> None: 

2419 """Set up the module for pytest.""" 

2420 clean_environment() 

2421 

2422 

2423if __name__ == "__main__": 

2424 clean_environment() 

2425 unittest.main()