Coverage for tests/test_butler.py: 12% (1262 statements; coverage.py v7.2.7)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler."""
from __future__ import annotations

import gc
import json
import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import string
import tempfile
import unittest
import unittest.mock
import uuid
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, cast

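# Optional test dependencies. The try/except blocks below let this module
# import cleanly when boto3/moto or testing.postgresql are not installed;
# the fallbacks stand in for the missing pieces.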

try:
    import boto3
    import botocore
    from moto import mock_s3  # type: ignore[import]
except ImportError:
    boto3 = None

    def mock_s3(cls):  # type: ignore[no-untyped-def]
        """A no-op decorator in case moto's mock_s3 cannot be imported."""
        return cls


try:
    # It's possible but silly to have testing.postgresql installed without
    # having the postgresql server installed (because then nothing in
    # testing.postgresql would work), so we use the presence of that module
    # to test whether we can expect the server to be available.
    import testing.postgresql  # type: ignore[import]
except ImportError:
    testing = None

import astropy.time
import sqlalchemy
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    ButlerRepoIndex,
    CollectionType,
    Config,
    DataCoordinate,
    DatasetExistence,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplate,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.datastores.fileDatastore import FileDatastore
from lsst.daf.butler.registries.sql import SqlRegistry
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
    OrphanedRecordError,
)
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImportType
from lsst.utils.introspection import get_full_type_name

if TYPE_CHECKING:
    import types

    from lsst.daf.butler import Datastore, DimensionGraph, Registry, StorageClass

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def clean_environment() -> None:
    """Remove external environment variables that affect the tests."""
    for k in (
        "DAF_BUTLER_REPOSITORY_INDEX",
        "S3_ENDPOINT_URL",
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SHARED_CREDENTIALS_FILE",
    ):
        os.environ.pop(k, None)


def makeExampleMetrics() -> MetricsExample:
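    """Return a populated MetricsExample for use as test data."""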

    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent the misdiagnosis
    that might otherwise occur when a standard exception is used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not covered by the other test
    cases."""

    def testSearchPath(self) -> None:
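        """Test that an extra config search path can override values from
        the default butler.yaml.
        """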

        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests(TestCaseMixin):
    """Helper class for running a suite of put/get tests against different
    butler configurations."""

    root: str
    default_run = "ingésτ😺"
    storageClassFactory: StorageClassFactory
    configFile: str
    tmpConfigFile: str

    @staticmethod
    def addDatasetType(
        datasetTypeName: str, dimensions: DimensionGraph, storageClass: StorageClass | str, registry: Registry
    ) -> DatasetType:
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls) -> None:
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(
        self,
        butler: Butler,
        datasetRef: DatasetRef,
        components: tuple[str, ...],
        reference: Any,
        collections: Any = None,
    ) -> None:
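        """Assert that each named component can be retrieved both directly
        and via a deferred handle, and matches the reference object.
        """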

        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self) -> None:
        removeTestTempDir(self.root)

    def create_butler(
        self, run: str, storageClass: StorageClass | str, datasetTypeName: str
    ) -> tuple[Butler, DatasetType]:
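        """Create a Butler for ``run``, register ``datasetTypeName``, and
        insert the instrument, filter, and visit dimension records that the
        put/get tests need.
        """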

        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {run})

        # Create and register a DatasetType
        dimensions = butler.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "visit_system": 1,
                },
            )
        return butler, datasetType

    def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Butler:
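        """Run a complete put/get round trip for the given storage class and
        dataset type name, covering component reads, artifact retrieval,
        parameters, and deletion. Returns the populated Butler so follow-on
        tests can reuse it.
        """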

        # New datasets will be added to run and tag, but we will only look in
        # tag when looking up datasets.
        run = self.default_run
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)
        assert butler.run is not None

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = butler.registry.expandDataId({"instrument": "DummyCamComp", "visit": 423})

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        ref = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run="put_run_1")
        args: tuple[DatasetRef] | tuple[str | DatasetType, DataCoordinate]
        for args in ((ref,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                kwargs: dict[str, Any] = {}
                if not isinstance(args[0], DatasetRef):  # type: ignore
                    kwargs["run"] = this_run
                ref = butler.put(metric, *args, **kwargs)
                self.assertIsInstance(ref, DatasetRef)

                # Test get with a resolved ref
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get by dataset type name and dataId
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a ref
                metricOut = butler.getDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

                # Can the artifacts themselves be retrieved?
                if not butler.datastore.isEphemeral:
                    root_uri = ResourcePath(self.root)

                    for preserve_path in (True, False):
                        destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                        # Use copy so that we can test that overwrite
                        # protection works (using "auto" for File URIs would
                        # use hard links and subsequent transfer would work
                        # because it knows they are the same file).
                        transferred = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, transfer="copy"
                        )
                        self.assertGreater(len(transferred), 0)
                        artifacts = list(ResourcePath.findFileResources([destination]))
                        self.assertEqual(set(transferred), set(artifacts))

                        for artifact in transferred:
                            path_in_destination = artifact.relative_to(destination)
                            self.assertIsNotNone(path_in_destination)
                            assert path_in_destination is not None

                            # When the path is not preserved there should not
                            # be any path separators.
                            num_seps = path_in_destination.count("/")
                            if preserve_path:
                                self.assertGreater(num_seps, 0)
                            else:
                                self.assertEqual(num_seps, 0)

                        primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                        n_uris = len(secondary_uris)
                        if primary_uri:
                            n_uris += 1
                        self.assertEqual(
                            len(artifacts),
                            n_uris,
                            "Comparing expected artifacts vs actual:"
                            f" {artifacts} vs {primary_uri} and {secondary_uris}",
                        )

                        if preserve_path:
                            # No need to run these twice
                            with self.assertRaises(ValueError):
                                butler.retrieveArtifacts([ref], destination, transfer="move")

                            with self.assertRaises(FileExistsError):
                                butler.retrieveArtifacts([ref], destination)

                            transferred_again = butler.retrieveArtifacts(
                                [ref], destination, preserve_path=preserve_path, overwrite=True
                            )
                            self.assertEqual(set(transferred_again), set(transferred))

                # Now remove the dataset completely.
                butler.pruneDatasets([ref], purge=True, unstore=True)
                # Lookup with the original args should now fail.
                kwargs = {"collections": this_run}
                if isinstance(args[0], DatasetRef):
                    kwargs = {}  # Prevent warning from being issued.
                self.assertFalse(butler.exists(*args, **kwargs))
                # get() should now fail.
                with self.assertRaises(FileNotFoundError):
                    butler.get(ref)
                # Registry shouldn't be able to find it by dataset_id anymore.
                self.assertIsNone(butler.registry.getDataset(ref.id))

                # Do explicit registry removal since we know the collections
                # are empty
                butler.registry.removeCollection(this_run)
                expected_collections.remove(this_run)

        # Create DatasetRef for put using default run.
        refIn = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run=butler.run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        assert metric.data is not None  # for mypy
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            assert compRef is not None
            summary = butler.get(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaisesRegex(ValueError, "Supplied dataset type .* inconsistent with registry"):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaisesRegex(ValueError, "DatasetRef given, cannot use dataId as well"):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match.
        with self.assertRaises(FileNotFoundError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=uuid.UUID(int=101), run=butler.run))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaisesRegex(KeyError, "Parameter 'unsupported' not understood"):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have the expected collections
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Add the same ref again, so we can check that a duplicate put fails.
        ref = butler.put(metric, datasetType, dataId)

        # Repeat put will fail.
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Repeat the same sequence with a resolved ref.
        butler.pruneDatasets([ref], unstore=True, purge=True)
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaisesRegex(ConflictingDefinitionError, "Datastore already contains dataset"):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # In the case of a resolved ref this write will succeed.
        ref = butler.put(metric, refIn)

        # Leave the dataset in place since some downstream tests require
        # something to be present

        return butler

    def testDeferredCollectionPassing(self) -> None:
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.exists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to find the dataset without any collection raises a
        # CollectionError.
        self.assertFalse(butler.exists(datasetType, dataId))
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Removing the dataset from the new collection should still leave it
        # findable in the original run collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.exists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True
    validationCanFail: bool
    fullConfigKey: str | None
    registryStr: str | None
    datastoreName: list[str] | None
    datastoreStr: list[str]

    def setUp(self) -> None:
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self) -> None:
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run=self.default_run)
            self.assertIsInstance(butler, Butler)

            # Even with a ResourcePath.
            butler = Butler(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {self.default_run})

        # Check that some special characters can be included in the run name.
        special_run = "u@b.c-A"
        butler_special = Butler(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, ("other",))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"file://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), {"label", "bad_label"})
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler("not_there", writeable=False)
                    with self.assertRaisesRegex(FileNotFoundError, "resolved from alias 'bad_label'"):
                        Butler("bad_label")
                    with self.assertRaises(FileNotFoundError):
                        # Should ignore aliases.
                        Butler(ResourcePath("label", forceAbsolute=False))
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertEqual(
                        Butler.get_repo_uri("missing", True), ResourcePath("missing", forceAbsolute=False)
                    )
                    self.assertIn("not known to", str(cm.exception))
                    # Should report no failure.
                    self.assertEqual(ButlerRepoIndex.get_failure_reason(), "")
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                # Now with empty configuration.
                butler_index = Config()
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    with self.assertRaisesRegex(FileNotFoundError, "(no known aliases)"):
                        Butler("label")
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                # Now with bad contents.
                with open(temp_file.ospath, "w") as fh:
                    print("'", file=fh)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    with self.assertRaisesRegex(FileNotFoundError, "(no known aliases:.*could not be read)"):
                        Butler("label")
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())

            with self.assertRaisesRegex(FileNotFoundError, "index file not found"):
                Butler("label")

        # Check that we can create a Butler when the alias file is not found.
        butler = Butler(self.tmpConfigFile, writeable=False)
        self.assertIsInstance(butler, Butler)
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertEqual(Butler.get_repo_uri("label", True), ResourcePath("label", forceAbsolute=False))
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases.*No repository index"):
            # No aliases registered.
            Butler("not_there")
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testStorageClassOverrideGet(self) -> None:
        """Test storage class conversion on get with override."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        datasetTypeName = "anything"
        run = self.default_run

        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset.
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        ref = butler.put(metric, datasetType, dataId)

        # Return the native type.
        retrieved = butler.get(ref)
        self.assertEqual(retrieved, metric)

        # Specify an override.
        new_sc = self.storageClassFactory.getStorageClass("MetricsConversion")
        model = butler.get(ref, storageClass=new_sc)
        self.assertNotEqual(type(model), type(retrieved))
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override later.
        deferred = butler.getDeferred(ref)
        model = deferred.get(storageClass=new_sc)
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override up front.
        deferred = butler.getDeferred(ref, storageClass=new_sc)
        model = deferred.get()
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Retrieve a component. Should be a tuple.
        data = butler.get("anything.data", dataId, storageClass="StructuredDataDataTestTuple")
        self.assertIs(type(data), tuple)
        self.assertEqual(data, tuple(retrieved.data))

        # A parameter on the write storage class should work regardless
        # of the read storage class.
        data = butler.get(
            "anything.data",
            dataId,
            storageClass="StructuredDataDataTestTuple",
            parameters={"slice": slice(2, 4)},
        )
        self.assertEqual(len(data), 2)

        # Try a parameter that is known to the read storage class but not
        # the write storage class.
        with self.assertRaises(KeyError):
            butler.get(
                "anything.data",
                dataId,
                storageClass="StructuredDataDataTestTuple",
                parameters={"xslice": slice(2, 4)},
            )

    def testPytypePutCoercion(self) -> None:
        """Test python type coercion on Butler.get and put."""

        # Store some data with the normal example storage class.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        datasetTypeName = "test_metric"
        butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)

        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Put a dict; this should coerce to a MetricsExample
        test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
        metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
        test_metric = butler.get(metric_ref)
        self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
        self.assertEqual(test_metric.summary, test_dict["summary"])
        self.assertEqual(test_metric.output, test_dict["output"])

        # Check that the put still works if a DatasetType is given with
        # a definition matching this python type.
        registry_type = butler.registry.getDatasetType(datasetTypeName)
        this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
        metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
        self.assertEqual(metric2_ref.datasetType, registry_type)

        # The get will return the type expected by registry.
        test_metric2 = butler.get(metric2_ref)
        self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")

        # Make a new DatasetRef with the compatible but different DatasetType.
        # This should now return a dict.
        new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
        test_dict2 = butler.get(new_ref)
        self.assertEqual(get_full_type_name(test_dict2), "dict")

        # Get it again with the compatible dataset type definition, this time
        # using the dataset type and dataId rather than a ref. This should be
        # consistent with the ref-based get() behavior and return the python
        # type of the given DatasetType.
        test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
        self.assertEqual(get_full_type_name(test_dict3), "dict")

    def testIngest(self) -> None:
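        """Test ingesting external files, including a multi-dataset
        single-file ingest and re-ingest in "execution butler" mode.
        """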

        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Create and register a DatasetType
        dimensions = butler.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImportType("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, run=self.default_run)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single-file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, run=self.default_run))

        # Test "move" transfer to ensure that the files themselves
        # have disappeared following ingest.
        with ResourcePath.temporary_uri(suffix=".yaml") as tempFile:
            tempFile.transfer_from(ResourcePath(metricFile), transfer="copy")

            datasets = []
            datasets.append(FileDataset(path=tempFile, refs=refs, formatter=MultiDetectorFormatter))

            # For the first ingest use copy.
            butler.ingest(*datasets, transfer="copy", record_validation_info=False)

            # Now try to ingest again in "execution butler" mode, where
            # the registry entries exist but the datastore does not have
            # the files. We also need to strip the dimension records to ensure
            # that they will be re-added by the ingest.
            ref = datasets[0].refs[0]
            datasets[0].refs = [
                cast(
                    DatasetRef,
                    butler.registry.findDataset(ref.datasetType, dataId=ref.dataId, collections=ref.run),
                )
                for ref in datasets[0].refs
            ]
            all_refs = []
            for dataset in datasets:
                refs = []
                for ref in dataset.refs:
                    # Create a dict from the dataId to drop the records.
                    new_data_id = {str(k): v for k, v in ref.dataId.items()}
                    new_ref = butler.registry.findDataset(ref.datasetType, new_data_id, collections=ref.run)
                    assert new_ref is not None
                    self.assertFalse(new_ref.dataId.hasRecords())
                    refs.append(new_ref)
                dataset.refs = refs
                all_refs.extend(dataset.refs)
            butler.pruneDatasets(all_refs, disassociate=False, unstore=True, purge=False)

            # Use move mode to test that the file is deleted. Also
            # disable recording of file size.
            butler.ingest(*datasets, transfer="move", record_validation_info=False)

            # Check that every ref now has records.
            for dataset in datasets:
                for ref in dataset.refs:
                    self.assertTrue(ref.dataId.hasRecords())

            # Ensure that the file has disappeared.
            self.assertFalse(tempFile.exists())

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])  # type: ignore[attr-defined]
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second.
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore, since in-memory cannot ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.exists(datasetTypeName, dataId1))
        self.assertTrue(butler.exists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

        # Ensure we can ingest 0 datasets
        datasets = []
        butler.ingest(*datasets)

    def testPickle(self) -> None:
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self) -> None:
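        """Test dataset type registration and queries, including component
        dataset types, and validate the configuration against them.
        """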

        butler = Butler(self.tmpConfigFile, run=self.default_run)
        dimensions = butler.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries: list[tuple[str, list[Mapping[str, Any]]]] = [
            (
                "instrument",
                [
                    {"instrument": "DummyCam"},
                    {"instrument": "DummyHSC"},
                    {"instrument": "DummyCamComp"},
                ],
            ),
            ("physical_filter", [{"instrument": "DummyCam", "name": "d-r", "band": "R"}]),
            ("visit", [{"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}]),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for element, data in dimensionEntries:
            butler.registry.insertDimensionData(element, *data)

        # When a DatasetType is added to the registry, entries are not
        # created for its components, but querying can still return the
        # component dataset types.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry: set[DatasetType] = set()
        for parent_dataset_type in butler.registry.queryDatasetTypes():
            fromRegistry.add(parent_dataset_type)
            fromRegistry.update(parent_dataset_type.makeAllComponentDatasetTypes())
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self) -> None:
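        """Test that an exception raised inside a transaction rolls back
        dimension inserts, dataset puts, and datastore writes together.
        """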

        butler = Butler(self.tmpConfigFile, run=self.default_run)
        datasetTypeName = "test_metric"
        dimensions = butler.dimensions.extract(["instrument", "visit"])
        dimensionEntries: tuple[tuple[str, Mapping[str, Any]], ...] = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test get with a resolved ref
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get by dataset type name and dataId
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly that the Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.get(ref)

    def testMakeRepo(self) -> None:
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # Create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" is missing the key we know it should be
        # inheriting from defaults (and that "full" has it).
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with a relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self) -> None:
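        """Test that str(butler) contains the expected datastore and
        registry descriptions.
        """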

        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self) -> None:
        """Test that dataIds can be rewritten based on dimension records."""

        butler = Butler(self.tmpConfigFile, run=self.default_run)

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # ... and check that we can get the dataset back with the same
            # dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)


class FileDatastoreButlerTests(ButlerTests):
    """Common tests and specialization of ButlerTests for butlers backed
    by datastores that inherit from FileDatastore.
    """

    def checkFileExists(self, root: str | ResourcePath, relpath: str | ResourcePath) -> bool:
        """Check whether a file exists at the given path relative to root.

        Test testPutTemplates verifies the actual physical existence of the
        files in the requested location.
        """
        uri = ResourcePath(root, forceDirectory=True)
        return uri.join(relpath).exists()

    def testPutTemplates(self) -> None:
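        """Test that file templates produce the expected artifact paths and
        that invalid or non-unique templates are rejected.
        """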

1235 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1236 butler = Butler(self.tmpConfigFile, run=self.default_run) 

1237 

1238 # Add needed Dimensions 

1239 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

1240 butler.registry.insertDimensionData( 

1241 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

1242 ) 

1243 butler.registry.insertDimensionData( 

1244 "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"} 

1245 ) 

1246 butler.registry.insertDimensionData( 

1247 "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"} 

1248 ) 

1249 

1250 # Create and store a dataset 

1251 metric = makeExampleMetrics() 

1252 

1253 # Create two almost-identical DatasetTypes (both will use default 

1254 # template) 

1255 dimensions = butler.dimensions.extract(["instrument", "visit"]) 

1256 butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass)) 

1257 butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass)) 

1258 butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass)) 

1259 

1260 dataId1 = {"instrument": "DummyCamComp", "visit": 423} 

1261 dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"} 

1262 

1263 # Put with exactly the data ID keys needed 

1264 ref = butler.put(metric, "metric1", dataId1) 

1265 uri = butler.getURI(ref) 

1266 self.assertTrue(uri.exists()) 

1267 self.assertTrue( 

1268 uri.unquoted_path.endswith(f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle") 

1269 ) 

1270 

1271 # Check the template based on dimensions 

1272 if hasattr(butler.datastore, "templates"): 

1273 butler.datastore.templates.validateTemplates([ref]) 

1274 

1275 # Put with extra data ID keys (physical_filter is an optional 

1276 # dependency); should not change template (at least the way we're 

1277 # defining them to behave now; the important thing is that they 

1278 # must be consistent). 

1279 ref = butler.put(metric, "metric2", dataId2) 

1280 uri = butler.getURI(ref) 

1281 self.assertTrue(uri.exists()) 

1282 self.assertTrue( 

1283 uri.unquoted_path.endswith(f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle") 

1284 ) 

1285 

1286 # Check the template based on dimensions 

1287 if hasattr(butler.datastore, "templates"): 

1288 butler.datastore.templates.validateTemplates([ref]) 

1289 

1290 # Use a template that has a typo in dimension record metadata. 

1291 # Easier to test with a butler that has a ref with records attached. 

1292 template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits") 

1293 with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"): 

1294 path = template.format(ref) 

1295 self.assertEqual(path, f"a/v423/{ref.id}_fits") 

1296 

1297 template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits") 

1298 with self.assertRaises(KeyError): 

1299 with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"): 

1300 template.format(ref) 

1301 

1302 # Now use a file template that will not result in unique filenames 

1303 with self.assertRaises(FileTemplateValidationError): 

1304 butler.put(metric, "metric3", dataId1) 

1305 

1306 def testImportExport(self) -> None: 

1307 # Run put/get tests just to create and populate a repo. 

1308 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1309 self.runImportExportTest(storageClass) 

1310 

1311 @unittest.expectedFailure 

1312 def testImportExportVirtualComposite(self) -> None: 

1313 # Run put/get tests just to create and populate a repo. 

1314 storageClass = self.storageClassFactory.getStorageClass("StructuredComposite") 

1315 self.runImportExportTest(storageClass) 

1316 

1317 def runImportExportTest(self, storageClass: StorageClass) -> None: 

1318 """This test does an export to a temp directory and an import back 

1319 into a new temp directory repo. It does not assume a posix datastore""" 

1320 exportButler = self.runPutGetTest(storageClass, "test_metric") 

1321 

1322 # Test that we must have a file extension. 

1323 with self.assertRaises(ValueError): 

1324 with exportButler.export(filename="dump", directory=".") as export: 

1325 pass 

1326 

1327 # Test that unknown format is not allowed. 

1328 with self.assertRaises(ValueError): 

1329 with exportButler.export(filename="dump.fits", directory=".") as export: 

1330 pass 

1331 

1332 # Test that the repo actually has at least one dataset. 

1333 datasets = list(exportButler.registry.queryDatasets(..., collections=...)) 

1334 self.assertGreater(len(datasets), 0) 

1335 # Add a DimensionRecord that's unused by those datasets. 

1336 skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")} 

1337 exportButler.registry.insertDimensionData("skymap", skymapRecord) 

1338 # Export and then import datasets. 

1339 with safeTestTempDir(TESTDIR) as exportDir: 

1340 exportFile = os.path.join(exportDir, "exports.yaml") 

1341 with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export: 

1342 export.saveDatasets(datasets) 

1343 # Export the same datasets again. This should quietly do 

1344 # nothing because of internal deduplication, and it shouldn't 

1345 # complain about being asked to export the "htm7" elements even 

1346 # though there aren't any in these datasets or in the database. 

1347 export.saveDatasets(datasets, elements=["htm7"]) 

1348 # Save one of the data IDs again; this should be harmless 

1349 # because of internal deduplication. 

1350 export.saveDataIds([datasets[0].dataId]) 

1351 # Save some dimension records directly. 

1352 export.saveDimensionData("skymap", [skymapRecord]) 

1353 self.assertTrue(os.path.exists(exportFile)) 

1354 with safeTestTempDir(TESTDIR) as importDir: 

1355 # We always want this to be a local posix butler 

1356 Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml"))) 

1357 # Calling script.butlerImport tests the implementation of the 

1358 # butler command line interface "import" subcommand. Functions 

1359 # in the script folder are generally considered protected and 

1360 # should not be used as public api. 

1361 with open(exportFile) as f: 

1362 script.butlerImport( 

1363 importDir, 

1364 export_file=f, 

1365 directory=exportDir, 

1366 transfer="auto", 

1367 skip_dimensions=None, 

1368 ) 

1369 importButler = Butler(importDir, run=self.default_run) 

1370 for ref in datasets: 

1371 with self.subTest(ref=ref): 

1372 # Test for existence by passing in the DatasetType and 

1373 # data ID separately, to avoid lookup by dataset_id. 

1374 self.assertTrue(importButler.exists(ref.datasetType, ref.dataId)) 

1375 self.assertEqual( 

1376 list(importButler.registry.queryDimensionRecords("skymap")), 

1377 [importButler.dimensions["skymap"].RecordClass(**skymapRecord)], 

1378 ) 

1379 
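    # The round trip above mirrors the public workflow; a rough sketch
    # (paths illustrative):
    #
    #     with butler.export(filename="exports.yaml", directory=export_dir,
    #                        transfer="auto") as export:
    #         export.saveDatasets(refs)
    #         export.saveDimensionData("skymap", [record])
    #
    # followed, on the import side, by the command line equivalent of
    # script.butlerImport:
    #
    #     butler import NEW_REPO EXPORT_DIR --export-file exports.yaml -t auto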

1380 def testRemoveRuns(self) -> None: 

1381 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1382 butler = Butler(self.tmpConfigFile, writeable=True) 

1383 # Load registry data with dimensions to hang datasets off of. 

1384 registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry")) 

1385 butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) 

1386        # Add some RUN-type collections.

1387 run1 = "run1" 

1388 butler.registry.registerRun(run1) 

1389 run2 = "run2" 

1390 butler.registry.registerRun(run2) 

1391        # Put a dataset in each.

1392 metric = makeExampleMetrics() 

1393 dimensions = butler.dimensions.extract(["instrument", "physical_filter"]) 

1394 datasetType = self.addDatasetType( 

1395 "prune_collections_test_dataset", dimensions, storageClass, butler.registry 

1396 ) 

1397 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1) 

1398 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2) 

1399 uri1 = butler.getURI(ref1) 

1400 uri2 = butler.getURI(ref2) 

1401 

1402 with self.assertRaises(OrphanedRecordError): 

1403 butler.registry.removeDatasetType(datasetType.name) 

1404 

1405 # Remove from both runs with different values for unstore. 

1406 butler.removeRuns([run1], unstore=True) 

1407 butler.removeRuns([run2], unstore=False) 

1408 # Should be nothing in registry for either one, and datastore should 

1409 # not think either exists. 

1410 with self.assertRaises(MissingCollectionError): 

1411 butler.registry.getCollectionType(run1) 

1412 with self.assertRaises(MissingCollectionError): 

1413 butler.registry.getCollectionType(run2) 

1414 self.assertFalse(butler.datastore.exists(ref1)) 

1415 self.assertFalse(butler.datastore.exists(ref2)) 

1416 # The ref we unstored should be gone according to the URI, but the 

1417 # one we forgot should still be around. 

1418 self.assertFalse(uri1.exists()) 

1419 self.assertTrue(uri2.exists()) 

1420 

1421 # Now that the collections have been pruned we can remove the 

1422 # dataset type 

1423 butler.registry.removeDatasetType(datasetType.name) 

1424 

1425 with self.assertLogs("lsst.daf.butler.registries", "INFO") as cm: 

1426            butler.registry.removeDatasetType(("test*", "test*"))

1427 self.assertIn("not defined", "\n".join(cm.output)) 

1428 
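    # Note on the semantics exercised by testRemoveRuns: unstore=True removes
    # the file artifacts along with the registry entries, while unstore=False
    # only forgets the datastore records and leaves the files on disk:
    #
    #     butler.removeRuns(["run1"], unstore=True)   # records and files gone
    #     butler.removeRuns(["run2"], unstore=False)  # records gone, file kept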

1429 

1430class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1431 """PosixDatastore specialization of a butler""" 

1432 

1433 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1434 fullConfigKey: str | None = ".datastore.formatters" 

1435 validationCanFail = True 

1436 datastoreStr = ["/tmp"] 

1437 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"] 

1438 registryStr = "/gen3.sqlite3" 

1439 

1440 def testPathConstructor(self) -> None: 

1441 """Independent test of constructor using PathLike.""" 

1442 butler = Butler(self.tmpConfigFile, run=self.default_run) 

1443 self.assertIsInstance(butler, Butler) 

1444 

1445 # And again with a Path object with the butler yaml 

1446 path = pathlib.Path(self.tmpConfigFile) 

1447 butler = Butler(path, writeable=False) 

1448 self.assertIsInstance(butler, Butler) 

1449 

1450 # And again with a Path object without the butler yaml 

1451 # (making sure we skip it if the tmp config doesn't end 

1452 # in butler.yaml -- which is the case for a subclass) 

1453 if self.tmpConfigFile.endswith("butler.yaml"): 

1454 path = pathlib.Path(os.path.dirname(self.tmpConfigFile)) 

1455 butler = Butler(path, writeable=False) 

1456 self.assertIsInstance(butler, Butler) 

1457 

1458 def testExportTransferCopy(self) -> None: 

1459        """Test local export using several file-based transfer modes."""

1460 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1461 exportButler = self.runPutGetTest(storageClass, "test_metric") 

1462 # Test that the repo actually has at least one dataset. 

1463 datasets = list(exportButler.registry.queryDatasets(..., collections=...)) 

1464 self.assertGreater(len(datasets), 0) 

1465 uris = [exportButler.getURI(d) for d in datasets] 

1466 assert isinstance(exportButler.datastore, FileDatastore) 

1467 datastoreRoot = exportButler.datastore.root 

1468 

1469 pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris] 

1470 

1471 for path in pathsInStore: 

1472 # Assume local file system 

1473 assert path is not None 

1474 self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}") 

1475 

1476 for transfer in ("copy", "link", "symlink", "relsymlink"): 

1477 with safeTestTempDir(TESTDIR) as exportDir: 

1478 with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export: 

1479 export.saveDatasets(datasets) 

1480 for path in pathsInStore: 

1481 assert path is not None 

1482 self.assertTrue( 

1483 self.checkFileExists(exportDir, path), 

1484 f"Check that mode {transfer} exported files", 

1485 ) 

1486 

1487 def testPruneDatasets(self) -> None: 

1488 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1489 butler = Butler(self.tmpConfigFile, writeable=True) 

1490 assert isinstance(butler.datastore, FileDatastore) 

1491 # Load registry data with dimensions to hang datasets off of. 

1492 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry")) 

1493 butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) 

1494 # Add some RUN-type collections. 

1495 run1 = "run1" 

1496 butler.registry.registerRun(run1) 

1497 run2 = "run2" 

1498 butler.registry.registerRun(run2) 

1499 # put some datasets. ref1 and ref2 have the same data ID, and are in 

1500 # different runs. ref3 has a different data ID. 

1501 metric = makeExampleMetrics() 

1502 dimensions = butler.dimensions.extract(["instrument", "physical_filter"]) 

1503 datasetType = self.addDatasetType( 

1504 "prune_collections_test_dataset", dimensions, storageClass, butler.registry 

1505 ) 

1506 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1) 

1507 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2) 

1508 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1) 

1509 

1510 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1511 for ref, stored in many_stored.items(): 

1512 self.assertTrue(stored, f"Ref {ref} should be stored") 

1513 

1514 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1515 for ref, exists in many_exists.items(): 

1516 self.assertTrue(exists, f"Checking ref {ref} exists.") 

1517 self.assertEqual(exists, DatasetExistence.VERIFIED, f"Ref {ref} should be stored") 

1518 

1519 # Simple prune. 

1520 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1521 self.assertFalse(butler.exists(ref1.datasetType, ref1.dataId, collections=run1)) 

1522 

1523 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1524 for ref, stored in many_stored.items(): 

1525 self.assertFalse(stored, f"Ref {ref} should not be stored") 

1526 

1527 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1528 for ref, exists in many_exists.items(): 

1529 self.assertEqual(exists, DatasetExistence.UNRECOGNIZED, f"Ref {ref} should not be stored") 

1530 

1531 # Put data back. 

1532 ref1_new = butler.put(metric, ref1) 

1533 self.assertEqual(ref1_new, ref1) # Reuses original ID. 

1534 ref2 = butler.put(metric, ref2) 

1535 

1536 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1537 self.assertTrue(many_stored[ref1]) 

1538 self.assertTrue(many_stored[ref2]) 

1539 self.assertFalse(many_stored[ref3]) 

1540 

1541 ref3 = butler.put(metric, ref3) 

1542 

1543 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1544 for ref, exists in many_exists.items(): 

1545            self.assertTrue(exists, f"Ref {ref} should be stored")

1546 

1547 # Clear out the datasets from registry and start again. 

1548 refs = [ref1, ref2, ref3] 

1549 butler.pruneDatasets(refs, purge=True, unstore=True) 

1550 for ref in refs: 

1551 butler.put(metric, ref) 

1552 

1553 # Test different forms of file availability. 

1554 # Need to be in a state where: 

1555 # - one ref just has registry record. 

1556 # - one ref has a missing file but a datastore record. 

1557 # - one ref has a missing datastore record but file is there. 

1558 # - one ref does not exist anywhere. 

1559 # Do not need to test a ref that has everything since that is tested 

1560 # above. 

1561 ref0 = DatasetRef( 

1562 datasetType, 

1563 DataCoordinate.standardize( 

1564 {"instrument": "Cam1", "physical_filter": "Cam1-G"}, universe=butler.dimensions 

1565 ), 

1566 run=run1, 

1567 ) 

1568 

1569 # Delete from datastore and retain in Registry. 

1570 butler.pruneDatasets([ref1], purge=False, unstore=True, disassociate=False) 

1571 

1572 # File has been removed. 

1573 uri2 = butler.datastore.getURI(ref2) 

1574 uri2.remove() 

1575 

1576 # Datastore has lost track. 

1577 butler.datastore.forget([ref3]) 

1578 

1579 # First test with a standard butler. 

1580 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True) 

1581 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1582 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED) 

1583 self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE) 

1584 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED) 

1585 

1586 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=False) 

1587 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1588 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED | DatasetExistence._ASSUMED) 

1589 self.assertEqual(exists_many[ref2], DatasetExistence.KNOWN) 

1590 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ASSUMED) 

1591 self.assertTrue(exists_many[ref2]) 

1592 

1593 # Check that per-ref query gives the same answer as many query. 

1594 for ref, exists in exists_many.items(): 

1595 self.assertEqual(butler.exists(ref, full_check=False), exists) 

1596 

1597 # Test again with a trusting butler. 

1598 butler.datastore.trustGetRequest = True 

1599 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True) 

1600 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1601 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED) 

1602 self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE) 

1603 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ARTIFACT) 

1604 

1605 # Check that per-ref query gives the same answer as many query. 

1606 for ref, exists in exists_many.items(): 

1607 self.assertEqual(butler.exists(ref, full_check=True), exists) 

1608 

1609 # Create a ref that surprisingly has the UUID of an existing ref 

1610 # but is not the same. 

1611 ref_bad = DatasetRef(datasetType, dataId=ref3.dataId, run=ref3.run, id=ref2.id) 

1612 with self.assertRaises(ValueError): 

1613 butler.exists(ref_bad) 

1614 

1615 # Create a ref that has a compatible storage class. 

1616 ref_compat = ref2.overrideStorageClass("StructuredDataDict") 

1617 exists = butler.exists(ref_compat) 

1618 self.assertEqual(exists, exists_many[ref2]) 

1619 

1620 # Remove everything and start from scratch. 

1621 butler.datastore.trustGetRequest = False 

1622 butler.pruneDatasets(refs, purge=True, unstore=True) 

1623 for ref in refs: 

1624 butler.put(metric, ref) 

1625 

1626 # These tests mess directly with the trash table and can leave the 

1627 # datastore in an odd state. Do them at the end. 

1628 # Check that in normal mode, deleting the record will lead to 

1629 # trash not touching the file. 

1630 uri1 = butler.datastore.getURI(ref1) 

1631 butler.datastore.bridge.moveToTrash([ref1], transaction=None) # Update the dataset_location table 

1632 butler.datastore.forget([ref1]) 

1633 butler.datastore.trash(ref1) 

1634 butler.datastore.emptyTrash() 

1635 self.assertTrue(uri1.exists()) 

1636 uri1.remove() # Clean it up. 

1637 

1638 # Simulate execution butler setup by deleting the datastore 

1639 # record but keeping the file around and trusting. 

1640 butler.datastore.trustGetRequest = True 

1641 uri2 = butler.datastore.getURI(ref2) 

1642 uri3 = butler.datastore.getURI(ref3) 

1643 self.assertTrue(uri2.exists()) 

1644 self.assertTrue(uri3.exists()) 

1645 

1646 # Remove the datastore record. 

1647 butler.datastore.bridge.moveToTrash([ref2], transaction=None) # Update the dataset_location table 

1648 butler.datastore.forget([ref2]) 

1649 self.assertTrue(uri2.exists()) 

1650 butler.datastore.trash([ref2, ref3]) 

1651 # Immediate removal for ref2 file 

1652 self.assertFalse(uri2.exists()) 

1653 # But ref3 has to wait for the empty. 

1654 self.assertTrue(uri3.exists()) 

1655 butler.datastore.emptyTrash() 

1656 self.assertFalse(uri3.exists()) 

1657 

1658 # Clear out the datasets from registry. 

1659 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1660 
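    # For reference, DatasetExistence values are composable flags; a minimal
    # sketch of how a caller might interpret the result of butler.exists:
    #
    #     existence = butler.exists(ref, full_check=True)
    #     if existence == DatasetExistence.VERIFIED:
    #         pass  # registry record, datastore record, and artifact all seen
    #     elif existence & DatasetExistence.RECORDED:
    #         pass  # registry knows the ref, but it was not fully verified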

1661 def testPytypeCoercion(self) -> None: 

1662 """Test python type coercion on Butler.get and put.""" 

1663 

1664 # Store some data with the normal example storage class. 

1665 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1666 datasetTypeName = "test_metric" 

1667 butler = self.runPutGetTest(storageClass, datasetTypeName) 

1668 

1669 dataId = {"instrument": "DummyCamComp", "visit": 423} 

1670 metric = butler.get(datasetTypeName, dataId=dataId) 

1671 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample") 

1672 

1673 datasetType_ori = butler.registry.getDatasetType(datasetTypeName) 

1674 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents") 

1675 

1676 # Now need to hack the registry dataset type definition. 

1677 # There is no API for this. 

1678 assert isinstance(butler.registry, SqlRegistry) 

1679 manager = butler.registry._managers.datasets 

1680 assert hasattr(manager, "_db") and hasattr(manager, "_static") 
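        # Note: Database.update interprets its second argument as a mapping
        # from column name to the *key* in each row dict that holds the
        # comparison value, so the unusual-looking {datasetTypeName: ...}
        # entries below supply the value matched against the "name" column.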

1681 manager._db.update( 

1682 manager._static.dataset_type, 

1683 {"name": datasetTypeName}, 

1684 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"}, 

1685 ) 

1686 

1687 # Force reset of dataset type cache 

1688 butler.registry.refresh() 

1689 

1690 datasetType_new = butler.registry.getDatasetType(datasetTypeName) 

1691 self.assertEqual(datasetType_new.name, datasetType_ori.name) 

1692 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel") 

1693 

1694 metric_model = butler.get(datasetTypeName, dataId=dataId) 

1695 self.assertNotEqual(type(metric_model), type(metric)) 

1696 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel") 

1697 

1698 # Put the model and read it back to show that everything now 

1699 # works as normal. 

1700 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424) 

1701 metric_model_new = butler.get(metric_ref) 

1702 self.assertEqual(metric_model_new, metric_model) 

1703 

1704 # Hack the storage class again to something that will fail on the 

1705 # get with no conversion class. 

1706 manager._db.update( 

1707 manager._static.dataset_type, 

1708 {"name": datasetTypeName}, 

1709 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"}, 

1710 ) 

1711 butler.registry.refresh() 

1712 

1713 with self.assertRaises(ValueError): 

1714 butler.get(datasetTypeName, dataId=dataId) 

1715 

1716 
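# The coercion above works because compatible storage classes declare
# converters between their python types; schematically (config shape
# illustrative, not copied from the shipped configs):
#
#     StructuredDataNoComponentsModel:
#       pytype: lsst.daf.butler.tests.MetricsExampleModel
#       converters:
#         lsst.daf.butler.tests.MetricsExample: <constructor or method name>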

1717@unittest.skipUnless(testing is not None, "testing.postgresql module not found") 

1718class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1719 """PosixDatastore specialization of a butler using Postgres""" 

1720 

1721 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1722 fullConfigKey = ".datastore.formatters" 

1723 validationCanFail = True 

1724 datastoreStr = ["/tmp"] 

1725 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"] 

1726 registryStr = "PostgreSQL@test" 

1727 postgresql: Any 

1728 

1729 @staticmethod 

1730 def _handler(postgresql: Any) -> None: 

1731 engine = sqlalchemy.engine.create_engine(postgresql.url()) 

1732 with engine.begin() as connection: 

1733 connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;")) 

1734 

1735 @classmethod 

1736 def setUpClass(cls) -> None: 

1737 # Create the postgres test server. 

1738 cls.postgresql = testing.postgresql.PostgresqlFactory( 

1739 cache_initialized_db=True, on_initialized=cls._handler 

1740 ) 

1741 super().setUpClass() 

1742 

1743 @classmethod 

1744 def tearDownClass(cls) -> None: 

1745 # Clean up any lingering SQLAlchemy engines/connections 

1746 # so they're closed before we shut down the server. 

1747 gc.collect() 

1748 cls.postgresql.clear_cache() 

1749 super().tearDownClass() 

1750 

1751 def setUp(self) -> None: 

1752 self.server = self.postgresql() 

1753 

1754 # Need to add a registry section to the config. 

1755 self._temp_config = False 

1756 config = Config(self.configFile) 

1757 config["registry", "db"] = self.server.url() 

1758 with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh: 

1759 config.dump(fh) 

1760 self.configFile = fh.name 

1761 self._temp_config = True 

1762 super().setUp() 

1763 

1764 def tearDown(self) -> None: 

1765 self.server.stop() 

1766 if self._temp_config and os.path.exists(self.configFile): 

1767 os.remove(self.configFile) 

1768 super().tearDown() 

1769 

1770 def testMakeRepo(self) -> None: 

1771 # The base class test assumes that it's using sqlite and assumes 

1772 # the config file is acceptable to sqlite. 

1773 raise unittest.SkipTest("Postgres config is not compatible with this test.") 

1774 

1775 
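# Pointing a butler at a non-default database is just a config override, as
# done in setUp above; a minimal sketch (URL illustrative):
#
#     config = Config("butler.yaml")
#     config["registry", "db"] = "postgresql://user@localhost/butler_test"
#     with open("butler-pg.yaml", "w") as fh:
#         config.dump(fh)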

1776class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1777 """InMemoryDatastore specialization of a butler""" 

1778 

1779 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1780 fullConfigKey = None 

1781 useTempRoot = False 

1782 validationCanFail = False 

1783 datastoreStr = ["datastore='InMemory"] 

1784 datastoreName = ["InMemoryDatastore@"] 

1785 registryStr = "/gen3.sqlite3" 

1786 

1787 def testIngest(self) -> None: 

1788 pass 

1789 

1790 

1791class ChainedDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1792    """ChainedDatastore specialization of a butler."""

1793 

1794 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

1795 fullConfigKey = ".datastore.datastores.1.formatters" 

1796 validationCanFail = True 

1797 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"] 

1798 datastoreName = [ 

1799 "InMemoryDatastore@", 

1800 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1", 

1801 "SecondDatastore", 

1802 ] 

1803 registryStr = "/gen3.sqlite3" 

1804 

1805 

1806class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase): 

1807 """Test that a yaml file in one location can refer to a root in another.""" 

1808 

1809 datastoreStr = ["dir1"] 

1810 # Disable the makeRepo test since we are deliberately not using 

1811 # butler.yaml as the config name. 

1812 fullConfigKey = None 

1813 

1814 def setUp(self) -> None: 

1815 self.root = makeTestTempDir(TESTDIR) 

1816 

1817 # Make a new repository in one place 

1818 self.dir1 = os.path.join(self.root, "dir1") 

1819 Butler.makeRepo(self.dir1, config=Config(self.configFile)) 

1820 

1821 # Move the yaml file to a different place and add a "root" 

1822 self.dir2 = os.path.join(self.root, "dir2") 

1823 os.makedirs(self.dir2, exist_ok=True) 

1824 configFile1 = os.path.join(self.dir1, "butler.yaml") 

1825 config = Config(configFile1) 

1826 config["root"] = self.dir1 

1827 configFile2 = os.path.join(self.dir2, "butler2.yaml") 

1828 config.dumpToUri(configFile2) 

1829 os.remove(configFile1) 

1830 self.tmpConfigFile = configFile2 

1831 

1832 def testFileLocations(self) -> None: 

1833 self.assertNotEqual(self.dir1, self.dir2) 

1834 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml"))) 

1835 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml"))) 

1836 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3"))) 

1837 

1838 
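# The relocated config written in setUp is an ordinary butler yaml with an
# explicit "root" entry, roughly (paths and datastore details illustrative):
#
#     root: /path/to/dir1
#     datastore:
#       cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
#     registry:
#       db: sqlite:////path/to/dir1/gen3.sqlite3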

1839class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase): 

1840    """Test that a config file created by makeRepo outside of the repo works."""

1841 

1842 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1843 

1844 def setUp(self) -> None: 

1845 self.root = makeTestTempDir(TESTDIR) 

1846 self.root2 = makeTestTempDir(TESTDIR) 

1847 

1848 self.tmpConfigFile = os.path.join(self.root2, "different.yaml") 

1849 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1850 

1851 def tearDown(self) -> None: 

1852 if os.path.exists(self.root2): 

1853 shutil.rmtree(self.root2, ignore_errors=True) 

1854 super().tearDown() 

1855 

1856 def testConfigExistence(self) -> None: 

1857 c = Config(self.tmpConfigFile) 

1858 uri_config = ResourcePath(c["root"]) 

1859 uri_expected = ResourcePath(self.root, forceDirectory=True) 

1860 self.assertEqual(uri_config.geturl(), uri_expected.geturl()) 

1861 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path") 

1862 

1863 def testPutGet(self) -> None: 

1864 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1865 self.runPutGetTest(storageClass, "test_metric") 

1866 

1867 

1868class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase): 

1869    """Test that a config file created by makeRepo outside of the repo works when outfile is a directory."""

1870 

1871 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1872 

1873 def setUp(self) -> None: 

1874 self.root = makeTestTempDir(TESTDIR) 

1875 self.root2 = makeTestTempDir(TESTDIR) 

1876 

1877 self.tmpConfigFile = self.root2 

1878 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1879 

1880 def testConfigExistence(self) -> None: 

1881        # Append the yaml file name, else the Config constructor does not

1882        # know the file type.

1884 super().testConfigExistence() 

1885 

1886 

1887class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase): 

1888    """Test that a config file created by makeRepo outside of the repo works when outfile is a URI."""

1889 

1890 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1891 

1892 def setUp(self) -> None: 

1893 self.root = makeTestTempDir(TESTDIR) 

1894 self.root2 = makeTestTempDir(TESTDIR) 

1895 

1896 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl() 

1897 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1898 

1899 

1900@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!") 

1901class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1902    """S3Datastore specialization of a butler: an S3-backed Datastore plus

1903    a local SQLite-based SqlRegistry.

1904    """

1905 

1906 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml") 

1907 fullConfigKey = None 

1908 validationCanFail = True 

1909 

1910 bucketName = "anybucketname" 

1911 """Name of the Bucket that will be used in the tests. The name is read from 

1912 the config file used with the tests during set-up. 

1913 """ 

1914 

1915 root = "butlerRoot/" 

1916    """Root repository directory expected to be used when useTempRoot=False.

1917    Otherwise the root is set to a randomly generated 20-character string

1918    during set-up.

1919 """ 

1920 

1921 datastoreStr = [f"datastore={root}"] 

1922 """Contains all expected root locations in a format expected to be 

1923 returned by Butler stringification. 

1924 """ 

1925 

1926 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"] 

1927 """The expected format of the S3 Datastore string.""" 

1928 

1929 registryStr = "/gen3.sqlite3" 

1930 """Expected format of the Registry string.""" 

1931 

1932 mock_s3 = mock_s3() 

1933 """The mocked s3 interface from moto.""" 

1934 

1935 def genRoot(self) -> str: 

1936        """Return a random string of length 20 to serve as a root

1937        name for the temporary bucket repo.

1938

1939        This plays the role of tempfile.mkdtemp, since it is what self.root

1940        becomes when useTempRoot is True.

1941 """ 

1942 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20)) 

1943 return rndstr + "/" 

1944 

1945 def setUp(self) -> None: 

1946 config = Config(self.configFile) 

1947 uri = ResourcePath(config[".datastore.datastore.root"]) 

1948 self.bucketName = uri.netloc 

1949 

1950 # Enable S3 mocking of tests. 

1951 self.mock_s3.start() 

1952 

1953        # Set up some fake credentials if they do not exist.

1954 self.usingDummyCredentials = setAwsEnvCredentials() 

1955 

1956 if self.useTempRoot: 

1957 self.root = self.genRoot() 

1958 rooturi = f"s3://{self.bucketName}/{self.root}" 

1959 config.update({"datastore": {"datastore": {"root": rooturi}}}) 

1960 

1961        # Need a local folder to store the registry database.

1962 self.reg_dir = makeTestTempDir(TESTDIR) 

1963 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

1964 

1965        # Moto needs to know that we expect the bucket self.bucketName to

1966        # exist (this used to be the class attribute bucketName).

1967 s3 = boto3.resource("s3") 

1968 s3.create_bucket(Bucket=self.bucketName) 

1969 

1970 self.datastoreStr = [f"datastore='{rooturi}'"] 

1971 self.datastoreName = [f"FileDatastore@{rooturi}"] 

1972 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False) 

1973 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml") 

1974 

1975 def tearDown(self) -> None: 

1976 s3 = boto3.resource("s3") 

1977 bucket = s3.Bucket(self.bucketName) 

1978 try: 

1979 bucket.objects.all().delete() 

1980 except botocore.exceptions.ClientError as e: 

1981 if e.response["Error"]["Code"] == "404": 

1982 # the key was not reachable - pass 

1983 pass 

1984 else: 

1985 raise 

1986 

1987 bucket = s3.Bucket(self.bucketName) 

1988 bucket.delete() 

1989 

1990 # Stop the S3 mock. 

1991 self.mock_s3.stop() 

1992 

1993        # Unset any dummy credentials that may have been set earlier.

1994 if self.usingDummyCredentials: 

1995 unsetAwsEnvCredentials() 

1996 

1997 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

1998 shutil.rmtree(self.reg_dir, ignore_errors=True) 

1999 

2000 if self.useTempRoot and os.path.exists(self.root): 

2001 shutil.rmtree(self.root, ignore_errors=True) 

2002 

2003 super().tearDown() 

2004 

2005 
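# The moto pattern used by the S3 test case in brief (assuming boto3 and
# moto are installed; bucket name illustrative):
#
#     mock = mock_s3()
#     mock.start()                    # boto3 calls now hit the mock service
#     boto3.resource("s3").create_bucket(Bucket="somebucket")
#     ...                             # exercise code that uses s3:// URIs
#     mock.stop()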

2006class PosixDatastoreTransfers(unittest.TestCase): 

2007 """Test data transfers between butlers. 

2008 

2009 Test for different managers. UUID to UUID and integer to integer are 

2010 tested. UUID to integer is not supported since we do not currently 

2011 want to allow that. Integer to UUID is supported with the caveat 

2012 that UUID4 will be generated and this will be incorrect for raw 

2013 dataset types. The test ignores that. 

2014 """ 

2015 

2016 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

2017 storageClassFactory: StorageClassFactory 

2018 

2019 @classmethod 

2020 def setUpClass(cls) -> None: 

2021 cls.storageClassFactory = StorageClassFactory() 

2022 cls.storageClassFactory.addFromConfig(cls.configFile) 

2023 

2024 def setUp(self) -> None: 

2025 self.root = makeTestTempDir(TESTDIR) 

2026 self.config = Config(self.configFile) 

2027 

2028 def tearDown(self) -> None: 

2029 removeTestTempDir(self.root) 

2030 

2031 def create_butler(self, manager: str, label: str) -> Butler: 

2032 config = Config(self.configFile) 

2033 config["registry", "managers", "datasets"] = manager 

2034 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True) 

2035 

2036 def create_butlers(self, manager1: str | None = None, manager2: str | None = None) -> None: 

2037 default = "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID" 

2038 if manager1 is None: 

2039 manager1 = default 

2040 if manager2 is None: 

2041 manager2 = default 

2042 self.source_butler = self.create_butler(manager1, "1") 

2043 self.target_butler = self.create_butler(manager2, "2") 

2044 

2045 def testTransferUuidToUuid(self) -> None: 

2046 self.create_butlers() 

2047 self.assertButlerTransfers() 

2048 

2049 def _enable_trust(self, datastore: Datastore) -> None: 

2050 if hasattr(datastore, "trustGetRequest"): 

2051 datastore.trustGetRequest = True 

2052 elif hasattr(datastore, "datastores"): 

2053 for datastore in datastore.datastores: 

2054 if hasattr(datastore, "trustGetRequest"): 

2055 datastore.trustGetRequest = True 

2056 

2057 def testTransferMissing(self) -> None: 

2058 """Test transfers where datastore records are missing. 

2059 

2060 This is how execution butler works. 

2061 """ 

2062 self.create_butlers() 

2063 

2064 # Configure the source butler to allow trust. 

2065 self._enable_trust(self.source_butler.datastore) 

2066 

2067 self.assertButlerTransfers(purge=True) 

2068 

2069 def testTransferMissingDisassembly(self) -> None: 

2070 """Test transfers where datastore records are missing. 

2071 

2072 This is how execution butler works. 

2073 """ 

2074 self.create_butlers() 

2075 

2076 # Configure the source butler to allow trust. 

2077 self._enable_trust(self.source_butler.datastore) 

2078 

2079 # Test disassembly. 

2080 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite") 

2081 

2082 def testAbsoluteURITransferDirect(self) -> None: 

2083 """Test transfer using an absolute URI.""" 

2084 self._absolute_transfer("auto") 

2085 

2086 def testAbsoluteURITransferCopy(self) -> None: 

2087 """Test transfer using an absolute URI.""" 

2088 self._absolute_transfer("copy") 

2089 

2090 def _absolute_transfer(self, transfer: str) -> None: 

2091 self.create_butlers() 

2092 

2093 storageClassName = "StructuredData" 

2094 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

2095 datasetTypeName = "random_data" 

2096 run = "run1" 

2097 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

2098 

2099 dimensions = self.source_butler.dimensions.extract(()) 

2100 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

2101 self.source_butler.registry.registerDatasetType(datasetType) 

2102 

2103 metrics = makeExampleMetrics() 

2104 with ResourcePath.temporary_uri(suffix=".json") as temp: 

2105 dataId = DataCoordinate.makeEmpty(self.source_butler.dimensions) 

2106 source_refs = [DatasetRef(datasetType, dataId, run=run)] 

2107 temp.write(json.dumps(metrics.exportAsDict()).encode()) 

2108 dataset = FileDataset(path=temp, refs=source_refs) 

2109 self.source_butler.ingest(dataset, transfer="direct") 

2110 

2111 self.target_butler.transfer_from( 

2112 self.source_butler, dataset.refs, register_dataset_types=True, transfer=transfer 

2113 ) 

2114 

2115 uri = self.target_butler.getURI(dataset.refs[0]) 

2116 if transfer == "auto": 

2117 self.assertEqual(uri, temp) 

2118 else: 

2119 self.assertNotEqual(uri, temp) 

2120 
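    # Note: transfer="direct" ingest records the file's absolute URI without
    # copying it into the datastore, which is why transfer="auto" above
    # preserves the original location while "copy" materializes a new
    # artifact in the target; schematically:
    #
    #     butler.ingest(FileDataset(path=uri, refs=[ref]), transfer="direct")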

2121 def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "StructuredData") -> None: 

2122 """Test that a run can be transferred to another butler.""" 

2123 

2124 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

2125 datasetTypeName = "random_data" 

2126 

2127 # Test will create 3 collections and we will want to transfer 

2128 # two of those three. 

2129 runs = ["run1", "run2", "other"] 

2130 

2131 # Also want to use two different dataset types to ensure that 

2132 # grouping works. 

2133 datasetTypeNames = ["random_data", "random_data_2"] 

2134 

2135 # Create the run collections in the source butler. 

2136 for run in runs: 

2137 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

2138 

2139 # Create dimensions in source butler. 

2140 n_exposures = 30 

2141 self.source_butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

2142 self.source_butler.registry.insertDimensionData( 

2143 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

2144 ) 

2145 self.source_butler.registry.insertDimensionData( 

2146 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"} 

2147 ) 

2148 

2149 for i in range(n_exposures): 

2150 self.source_butler.registry.insertDimensionData( 

2151 "exposure", 

2152 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"}, 

2153 ) 

2154 

2155 # Create dataset types in the source butler. 

2156 dimensions = self.source_butler.dimensions.extract(["instrument", "exposure"]) 

2157 for datasetTypeName in datasetTypeNames: 

2158 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

2159 self.source_butler.registry.registerDatasetType(datasetType) 

2160 

2161 # Write a dataset to an unrelated run -- this will ensure that 

2162 # we are rewriting integer dataset ids in the target if necessary. 

2163 # Will not be relevant for UUID. 

2164 run = "distraction" 

2165 butler = Butler(butler=self.source_butler, run=run) 

2166 butler.put( 

2167 makeExampleMetrics(), 

2168 datasetTypeName, 

2169 exposure=1, 

2170 instrument="DummyCamComp", 

2171 physical_filter="d-r", 

2172 ) 

2173 

2174 # Write some example metrics to the source 

2175 butler = Butler(butler=self.source_butler) 

2176 

2177 # Set of DatasetRefs that should be in the list of refs to transfer 

2178 # but which will not be transferred. 

2179 deleted: set[DatasetRef] = set() 

2180 

2181 n_expected = 20 # Number of datasets expected to be transferred 

2182 source_refs = [] 

2183 for i in range(n_exposures): 

2184            # Put a third of the datasets into each collection; only retain

2185            # two thirds.

2186 index = i % 3 

2187 run = runs[index] 

2188 datasetTypeName = datasetTypeNames[i % 2] 

2189 

2190 metric = MetricsExample( 

2191 summary={"counter": i}, output={"text": "metric"}, data=[2 * x for x in range(i)] 

2192 ) 

2193 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"} 

2194 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run) 

2195 

2196 # Remove the datastore record using low-level API 

2197 if purge: 

2198 # Remove records for a fraction. 

2199 if index == 1: 

2200 # For one of these delete the file as well. 

2201 # This allows the "missing" code to filter the 

2202 # file out. 

2203 # Access the individual datastores. 

2204 datastores = [] 

2205 if hasattr(butler.datastore, "datastores"): 

2206 datastores.extend(butler.datastore.datastores) 

2207 else: 

2208 datastores.append(butler.datastore) 

2209 

2210 if not deleted: 

2211 # For a chained datastore we need to remove 

2212 # files in each chain. 

2213 for datastore in datastores: 

2214 # The file might not be known to the datastore 

2215 # if constraints are used. 

2216 try: 

2217 primary, uris = datastore.getURIs(ref) 

2218 except FileNotFoundError: 

2219 continue 

2220 if primary: 

2221 if primary.scheme != "mem": 

2222 primary.remove() 

2223 for uri in uris.values(): 

2224 if uri.scheme != "mem": 

2225 uri.remove() 

2226 n_expected -= 1 

2227 deleted.add(ref) 

2228 

2229 # Remove the datastore record. 

2230 for datastore in datastores: 

2231 if hasattr(datastore, "removeStoredItemInfo"): 

2232 datastore.removeStoredItemInfo(ref) 

2233 

2234 if index < 2: 

2235 source_refs.append(ref) 

2236 if ref not in deleted: 

2237 new_metric = butler.get(ref) 

2238 self.assertEqual(new_metric, metric) 

2239 

2240 # Create some bad dataset types to ensure we check for inconsistent 

2241 # definitions. 

2242 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList") 

2243 for datasetTypeName in datasetTypeNames: 

2244 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass) 

2245 self.target_butler.registry.registerDatasetType(datasetType) 

2246 with self.assertRaises(ConflictingDefinitionError) as cm: 

2247 self.target_butler.transfer_from(self.source_butler, source_refs) 

2248 self.assertIn("dataset type differs", str(cm.exception)) 

2249 

2250 # And remove the bad definitions. 

2251 for datasetTypeName in datasetTypeNames: 

2252 self.target_butler.registry.removeDatasetType(datasetTypeName) 

2253 

2254 # Transfer without creating dataset types should fail. 

2255 with self.assertRaises(KeyError): 

2256 self.target_butler.transfer_from(self.source_butler, source_refs) 

2257 

2258 # Transfer without creating dimensions should fail. 

2259 with self.assertRaises(ConflictingDefinitionError) as cm: 

2260 self.target_butler.transfer_from(self.source_butler, source_refs, register_dataset_types=True) 

2261 self.assertIn("dimension", str(cm.exception)) 

2262 

2263 # The failed transfer above leaves registry in an inconsistent 

2264 # state because the run is created but then rolled back without 

2265 # the collection cache being cleared. For now force a refresh. 

2266 # Can remove with DM-35498. 

2267 self.target_butler.registry.refresh() 

2268 

2269 # Now transfer them to the second butler, including dimensions. 

2270 with self.assertLogs(level=logging.DEBUG) as log_cm: 

2271 transferred = self.target_butler.transfer_from( 

2272 self.source_butler, 

2273 source_refs, 

2274 register_dataset_types=True, 

2275 transfer_dimensions=True, 

2276 ) 

2277 self.assertEqual(len(transferred), n_expected) 

2278 log_output = ";".join(log_cm.output) 

2279 

2280        # A ChainedDatastore will use the in-memory datastore for mexists,

2281        # so we cannot rely on the mexists log message.

2282 self.assertIn("Number of datastore records found in source", log_output) 

2283 self.assertIn("Creating output run", log_output) 

2284 

2285 # Do the transfer twice to ensure that it will do nothing extra. 

2286 # Only do this if purge=True because it does not work for int 

2287 # dataset_id. 

2288 if purge: 

2289 # This should not need to register dataset types. 

2290 transferred = self.target_butler.transfer_from(self.source_butler, source_refs) 

2291 self.assertEqual(len(transferred), n_expected) 

2292 

2293 # Also do an explicit low-level transfer to trigger some 

2294 # edge cases. 

2295 with self.assertLogs(level=logging.DEBUG) as log_cm: 

2296 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs) 

2297 log_output = ";".join(log_cm.output) 

2298 self.assertIn("no file artifacts exist", log_output) 

2299 

2300 with self.assertRaises((TypeError, AttributeError)): 

2301 self.target_butler.datastore.transfer_from(self.source_butler, source_refs) # type: ignore 

2302 

2303 with self.assertRaises(ValueError): 

2304 self.target_butler.datastore.transfer_from( 

2305 self.source_butler.datastore, source_refs, transfer="split" 

2306 ) 

2307 

2308 # Now try to get the same refs from the new butler. 

2309 for ref in source_refs: 

2310 if ref not in deleted: 

2311 new_metric = self.target_butler.get(ref) 

2312 old_metric = self.source_butler.get(ref) 

2313 self.assertEqual(new_metric, old_metric) 

2314 

2315 # Now prune run2 collection and create instead a CHAINED collection. 

2316 # This should block the transfer. 

2317 self.target_butler.removeRuns(["run2"], unstore=True) 

2318 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED) 

2319 with self.assertRaises(CollectionTypeError): 

2320 # Re-importing the run1 datasets can be problematic if they 

2321 # use integer IDs so filter those out. 

2322 to_transfer = [ref for ref in source_refs if ref.run == "run2"] 

2323 self.target_butler.transfer_from(self.source_butler, to_transfer) 

2324 

2325 
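# For reference, the high-level transfer API exercised throughout this class:
#
#     transferred = target_butler.transfer_from(
#         source_butler,
#         refs,
#         register_dataset_types=True,  # create missing dataset types
#         transfer_dimensions=True,     # copy needed dimension records
#     )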

2326class ChainedDatastoreTransfers(PosixDatastoreTransfers): 

2327 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

2328 

2329 

2330def setup_module(module: types.ModuleType) -> None: 

2331 clean_environment() 

2332 

2333 

2334if __name__ == "__main__": 

2335 clean_environment() 

2336 unittest.main()