Coverage for tests/test_butler.py: 12% (1262 statements)


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for Butler.
"""

from __future__ import annotations

import gc
import json
import logging
import os
import pathlib
import pickle
import posixpath
import random
import shutil
import string
import tempfile
import unittest
import uuid
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, cast

try:
    import boto3
    import botocore
    from moto import mock_s3  # type: ignore[import]
except ImportError:
    boto3 = None

    def mock_s3(cls):  # type: ignore[no-untyped-def]
        """No-op decorator in case moto mock_s3 cannot be imported."""
        return cls
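
    # Sketch of why the fallback matters (the class name below is
    # hypothetical): with either the real moto decorator or this no-op in
    # scope, S3-backed test cases can be decorated unconditionally,
    #
    #     @mock_s3
    #     class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
    #         ...
    #
    # and skipped at runtime when ``boto3 is None``.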


try:
    # It's possible but silly to have testing.postgresql installed without
    # having the postgresql server installed (because then nothing in
    # testing.postgresql would work), so we use the presence of that module
    # to test whether we can expect the server to be available.
    import testing.postgresql  # type: ignore[import]
except ImportError:
    testing = None

import astropy.time
import sqlalchemy
from lsst.daf.butler import (
    Butler,
    ButlerConfig,
    ButlerRepoIndex,
    CollectionType,
    Config,
    DataCoordinate,
    DatasetExistence,
    DatasetRef,
    DatasetType,
    FileDataset,
    FileTemplate,
    FileTemplateValidationError,
    StorageClassFactory,
    ValidationError,
    script,
)
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.datastores.fileDatastore import FileDatastore
from lsst.daf.butler.registries.sql import SqlRegistry
from lsst.daf.butler.registry import (
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    MissingCollectionError,
    OrphanedRecordError,
)
from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter
from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir, safeTestTempDir
from lsst.resources import ResourcePath
from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
from lsst.utils import doImportType
from lsst.utils.introspection import get_full_type_name

if TYPE_CHECKING:
    import types

    from lsst.daf.butler import Datastore, DimensionGraph, Registry, StorageClass

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def clean_environment() -> None:
    """Remove external environment variables that affect the tests."""
    for k in (
        "DAF_BUTLER_REPOSITORY_INDEX",
        "S3_ENDPOINT_URL",
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SHARED_CREDENTIALS_FILE",
    ):
        os.environ.pop(k, None)
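
# Usage sketch (an assumption; no caller appears in this excerpt): test
# classes would typically run this before constructing any Butler, e.g.
#
#     @classmethod
#     def setUpClass(cls) -> None:
#         clean_environment()
#
# so that a developer's real AWS credentials or repository index cannot leak
# into the tests.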


def makeExampleMetrics() -> MetricsExample:
    """Return example dataset suitable for tests."""
    return MetricsExample(
        {"AM1": 5.2, "AM2": 30.6},
        {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}},
        [563, 234, 456.7, 752, 8, 9, 27],
    )
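
# For reference: the three positional arguments above populate the
# ``summary``, ``output`` and ``data`` attributes that later tests compare
# against (e.g. ``metric.summary`` in assertGetComponents and
# ``metric.data[:stop]`` in the slicing tests).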


class TransactionTestError(Exception):
    """Specific error for testing transactions, to prevent misdiagnosing
    that might otherwise occur when a standard exception is used.
    """

    pass


class ButlerConfigTests(unittest.TestCase):
    """Simple tests for ButlerConfig that are not tested in any other test
    cases.
    """

    def testSearchPath(self) -> None:
        configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config1 = ButlerConfig(configFile)
        self.assertNotIn("testConfigs", "\n".join(cm.output))

        overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs")
        with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm:
            config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory])
        self.assertIn("testConfigs", "\n".join(cm.output))

        key = ("datastore", "records", "table")
        self.assertNotEqual(config1[key], config2[key])
        self.assertEqual(config2[key], "override_record")


class ButlerPutGetTests(TestCaseMixin):
    """Helper methods for running a suite of put/get tests from different
    butler configurations.
    """

    root: str
    default_run = "ingésτ😺"
    storageClassFactory: StorageClassFactory
    configFile: str
    tmpConfigFile: str

    @staticmethod
    def addDatasetType(
        datasetTypeName: str, dimensions: DimensionGraph, storageClass: StorageClass | str, registry: Registry
    ) -> DatasetType:
        """Create a DatasetType and register it."""
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(datasetType)
        return datasetType

    @classmethod
    def setUpClass(cls) -> None:
        cls.storageClassFactory = StorageClassFactory()
        cls.storageClassFactory.addFromConfig(cls.configFile)

    def assertGetComponents(
        self,
        butler: Butler,
        datasetRef: DatasetRef,
        components: tuple[str, ...],
        reference: Any,
        collections: Any = None,
    ) -> None:
        datasetType = datasetRef.datasetType
        dataId = datasetRef.dataId
        deferred = butler.getDeferred(datasetRef)

        for component in components:
            compTypeName = datasetType.componentTypeName(component)
            result = butler.get(compTypeName, dataId, collections=collections)
            self.assertEqual(result, getattr(reference, component))
            result_deferred = deferred.get(component=component)
            self.assertEqual(result_deferred, result)

    def tearDown(self) -> None:
        removeTestTempDir(self.root)

    def create_butler(
        self, run: str, storageClass: StorageClass | str, datasetTypeName: str
    ) -> tuple[Butler, DatasetType]:
        butler = Butler(self.tmpConfigFile, run=run)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {run})

        # Create and register a DatasetType
        dimensions = butler.dimensions.extract(["instrument", "visit"])

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"}
        )
        visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai")
        visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai")
        butler.registry.insertDimensionData(
            "visit",
            {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r",
                "visit_system": 1,
                "datetime_begin": visit_start,
                "datetime_end": visit_end,
            },
        )

        # Add more visits for some later tests
        for visit_id in (424, 425):
            butler.registry.insertDimensionData(
                "visit",
                {
                    "instrument": "DummyCamComp",
                    "id": visit_id,
                    "name": f"fourtwentyfour_{visit_id}",
                    "physical_filter": "d-r",
                    "visit_system": 1,
                },
            )
        return butler, datasetType

    def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Butler:
        # New datasets will be added to per-put run collections, and we look
        # them up in those same collections.
        run = self.default_run
        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)
        assert butler.run is not None

        # Create and store a dataset
        metric = makeExampleMetrics()
        dataId = butler.registry.expandDataId({"instrument": "DummyCamComp", "visit": 423})

        # Put and remove the dataset once as a DatasetRef, once as a dataId,
        # and once with a DatasetType

        # Keep track of any collections we add and do not clean up
        expected_collections = {run}

        counter = 0
        ref = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run="put_run_1")
        args: tuple[DatasetRef] | tuple[str | DatasetType, DataCoordinate]
        for args in ((ref,), (datasetTypeName, dataId), (datasetType, dataId)):
            # Since we are using subTest we can get cascading failures
            # here with the first attempt failing and the others failing
            # immediately because the dataset already exists. Work around
            # this by using a distinct run collection each time.
            counter += 1
            this_run = f"put_run_{counter}"
            butler.registry.registerCollection(this_run, type=CollectionType.RUN)
            expected_collections.update({this_run})

            with self.subTest(args=args):
                kwargs: dict[str, Any] = {}
                if not isinstance(args[0], DatasetRef):  # type: ignore
                    kwargs["run"] = this_run
                ref = butler.put(metric, *args, **kwargs)
                self.assertIsInstance(ref, DatasetRef)

                # Test get with a resolved DatasetRef
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run)
                self.assertEqual(metric, metricOut)
                # Test get with a datasetRef
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test getDeferred with dataId
                metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get()
                self.assertEqual(metric, metricOut)
                # Test getDeferred with a ref
                metricOut = butler.getDeferred(ref).get()
                self.assertEqual(metric, metricOut)

                # Check we can get components
                if storageClass.isComposite():
                    self.assertGetComponents(
                        butler, ref, ("summary", "data", "output"), metric, collections=this_run
                    )

                # Can the artifacts themselves be retrieved?
                if not butler.datastore.isEphemeral:
                    root_uri = ResourcePath(self.root)

                    for preserve_path in (True, False):
                        destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/")
                        # Use copy so that we can test that overwrite
                        # protection works (using "auto" for File URIs would
                        # use hard links and subsequent transfer would work
                        # because it knows they are the same file).
                        transferred = butler.retrieveArtifacts(
                            [ref], destination, preserve_path=preserve_path, transfer="copy"
                        )
                        self.assertGreater(len(transferred), 0)
                        artifacts = list(ResourcePath.findFileResources([destination]))
                        self.assertEqual(set(transferred), set(artifacts))

                        for artifact in transferred:
                            path_in_destination = artifact.relative_to(destination)
                            self.assertIsNotNone(path_in_destination)
                            assert path_in_destination is not None

                            # When the path is not preserved there should not
                            # be any path separators.
                            num_seps = path_in_destination.count("/")
                            if preserve_path:
                                self.assertGreater(num_seps, 0)
                            else:
                                self.assertEqual(num_seps, 0)

                        primary_uri, secondary_uris = butler.datastore.getURIs(ref)
                        n_uris = len(secondary_uris)
                        if primary_uri:
                            n_uris += 1
                        self.assertEqual(
                            len(artifacts),
                            n_uris,
                            "Comparing expected artifacts vs actual:"
                            f" {artifacts} vs {primary_uri} and {secondary_uris}",
                        )

                        if preserve_path:
                            # No need to run these twice
                            with self.assertRaises(ValueError):
                                butler.retrieveArtifacts([ref], destination, transfer="move")

                            with self.assertRaises(FileExistsError):
                                butler.retrieveArtifacts([ref], destination)

                            transferred_again = butler.retrieveArtifacts(
                                [ref], destination, preserve_path=preserve_path, overwrite=True
                            )
                            self.assertEqual(set(transferred_again), set(transferred))

                # Now remove the dataset completely.
                butler.pruneDatasets([ref], purge=True, unstore=True)
                # Lookup with original args should still fail.
                kwargs = {"collections": this_run}
                if isinstance(args[0], DatasetRef):
                    kwargs = {}  # Prevent warning from being issued.
                self.assertFalse(butler.exists(*args, **kwargs))
                # get() should still fail.
                with self.assertRaises(FileNotFoundError):
                    butler.get(ref)
                # Registry shouldn't be able to find it by dataset_id anymore.
                self.assertIsNone(butler.registry.getDataset(ref.id))

            # Do explicit registry removal since we know they are
            # empty
            butler.registry.removeCollection(this_run)
            expected_collections.remove(this_run)

        # Create DatasetRef for put using default run.
        refIn = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run=butler.run)

        # Put the dataset again, since the last thing we did was remove it
        # and we want to use the default collection.
        ref = butler.put(metric, refIn)

        # Get with parameters
        stop = 4
        sliced = butler.get(ref, parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        assert metric.data is not None  # for mypy
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with parameters
        sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get()
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)
        # getDeferred with deferred parameters
        sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)})
        self.assertNotEqual(metric, sliced)
        self.assertEqual(metric.summary, sliced.summary)
        self.assertEqual(metric.output, sliced.output)
        self.assertEqual(metric.data[:stop], sliced.data)

        if storageClass.isComposite():
            # Check that components can be retrieved
            metricOut = butler.get(ref.datasetType.name, dataId)
            compNameS = ref.datasetType.componentTypeName("summary")
            compNameD = ref.datasetType.componentTypeName("data")
            summary = butler.get(compNameS, dataId)
            self.assertEqual(summary, metric.summary)
            data = butler.get(compNameD, dataId)
            self.assertEqual(data, metric.data)

            if "counter" in storageClass.derivedComponents:
                count = butler.get(ref.datasetType.componentTypeName("counter"), dataId)
                self.assertEqual(count, len(data))

                count = butler.get(
                    ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)}
                )
                self.assertEqual(count, stop)

            compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
            assert compRef is not None
            summary = butler.get(compRef)
            self.assertEqual(summary, metric.summary)

        # Create a Dataset type that has the same name but is inconsistent.
        inconsistentDatasetType = DatasetType(
            datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config")
        )

        # Getting with a dataset type that does not match registry fails
        with self.assertRaisesRegex(ValueError, "Supplied dataset type .* inconsistent with registry"):
            butler.get(inconsistentDatasetType, dataId)

        # Combining a DatasetRef with a dataId should fail
        with self.assertRaisesRegex(ValueError, "DatasetRef given, cannot use dataId as well"):
            butler.get(ref, dataId)
        # Getting with an explicit ref should fail if the id doesn't match.
        with self.assertRaises(FileNotFoundError):
            butler.get(DatasetRef(ref.datasetType, ref.dataId, id=uuid.UUID(int=101), run=butler.run))

        # Getting a dataset with unknown parameters should fail
        with self.assertRaisesRegex(KeyError, "Parameter 'unsupported' not understood"):
            butler.get(ref, parameters={"unsupported": True})

        # Check we have a collection
        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, expected_collections)

        # Clean up to check that we can remove something that may have
        # already had a component removed
        butler.pruneDatasets([ref], unstore=True, purge=True)

        # Add the same ref again, so we can check that duplicate put fails.
        ref = butler.put(metric, datasetType, dataId)

        # Repeat put will fail.
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # Put will still fail
        with self.assertRaisesRegex(
            ConflictingDefinitionError, "A database constraint failure was triggered"
        ):
            butler.put(metric, datasetType, dataId)

        # Repeat the same sequence with resolved ref.
        butler.pruneDatasets([ref], unstore=True, purge=True)
        ref = butler.put(metric, refIn)

        # Repeat put will fail.
        with self.assertRaisesRegex(ConflictingDefinitionError, "Datastore already contains dataset"):
            butler.put(metric, refIn)

        # Remove the datastore entry.
        butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False)

        # In case of resolved ref this write will succeed.
        ref = butler.put(metric, refIn)

        # Leave the dataset in place since some downstream tests require
        # something to be present

        return butler

    def testDeferredCollectionPassing(self) -> None:
        # Construct a butler with no run or collection, but make it writeable.
        butler = Butler(self.tmpConfigFile, writeable=True)
        # Create and register a DatasetType
        dimensions = butler.dimensions.extract(["instrument", "visit"])
        datasetType = self.addDatasetType(
            "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry
        )
        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
        )
        dataId = {"instrument": "DummyCamComp", "visit": 423}
        # Create dataset.
        metric = makeExampleMetrics()
        # Register a new run and put dataset.
        run = "deferred"
        self.assertTrue(butler.registry.registerRun(run))
        # Second time it will be allowed but indicate no-op.
        self.assertFalse(butler.registry.registerRun(run))
        ref = butler.put(metric, datasetType, dataId, run=run)
        # Putting with no run should fail with CollectionError.
        with self.assertRaises(CollectionError):
            butler.put(metric, datasetType, dataId)
        # Dataset should exist.
        self.assertTrue(butler.exists(datasetType, dataId, collections=[run]))
        # We should be able to get the dataset back, both with and without
        # a deferred dataset handle.
        self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run]))
        self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get())
        # Trying to get the dataset without any collection raises a
        # CollectionError, and exists() reports False.
        self.assertFalse(butler.exists(datasetType, dataId))
        with self.assertRaises(CollectionError):
            butler.get(datasetType, dataId)
        # Associate the dataset with a different collection.
        butler.registry.registerCollection("tagged")
        butler.registry.associate("tagged", [ref])
        # Removing the dataset from the new collection should leave it
        # findable in the original run collection.
        butler.pruneDatasets([ref], tags=["tagged"])
        self.assertTrue(butler.exists(datasetType, dataId, collections=[run]))


class ButlerTests(ButlerPutGetTests):
    """Tests for Butler."""

    useTempRoot = True
    validationCanFail: bool
    fullConfigKey: str | None
    registryStr: str | None
    datastoreName: list[str] | None
    datastoreStr: list[str]

    def setUp(self) -> None:
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        Butler.makeRepo(self.root, config=Config(self.configFile))
        self.tmpConfigFile = os.path.join(self.root, "butler.yaml")

    def testConstructor(self) -> None:
        """Independent test of constructor."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        self.assertIsInstance(butler, Butler)

        # Check that butler.yaml is added automatically.
        if self.tmpConfigFile.endswith(end := "/butler.yaml"):
            config_dir = self.tmpConfigFile[: -len(end)]
            butler = Butler(config_dir, run=self.default_run)
            self.assertIsInstance(butler, Butler)

            # Even with a ResourcePath.
            butler = Butler(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
            self.assertIsInstance(butler, Butler)

        collections = set(butler.registry.queryCollections())
        self.assertEqual(collections, {self.default_run})

        # Check that some special characters can be included in run name.
        special_run = "u@b.c-A"
        butler_special = Butler(butler=butler, run=special_run)
        collections = set(butler_special.registry.queryCollections("*@*"))
        self.assertEqual(collections, {special_run})

        butler2 = Butler(butler=butler, collections=["other"])
        self.assertEqual(butler2.collections, ("other",))
        self.assertIsNone(butler2.run)
        self.assertIs(butler.datastore, butler2.datastore)

        # Test that we can use an environment variable to find this
        # repository.
        butler_index = Config()
        butler_index["label"] = self.tmpConfigFile
        for suffix in (".yaml", ".json"):
            # Ensure that the content differs so that we know that
            # we aren't reusing the cache.
            bad_label = f"file://bucket/not_real{suffix}"
            butler_index["bad_label"] = bad_label
            with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
                butler_index.dumpToUri(temp_file)
                with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                    self.assertEqual(Butler.get_known_repos(), {"label", "bad_label"})
                    uri = Butler.get_repo_uri("bad_label")
                    self.assertEqual(uri, ResourcePath(bad_label))
                    uri = Butler.get_repo_uri("label")
                    butler = Butler(uri, writeable=False)
                    self.assertIsInstance(butler, Butler)
                    butler = Butler("label", writeable=False)
                    self.assertIsInstance(butler, Butler)
                    with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
                        Butler("not_there", writeable=False)
                    with self.assertRaisesRegex(FileNotFoundError, "resolved from alias 'bad_label'"):
                        Butler("bad_label")
                    with self.assertRaises(FileNotFoundError):
                        # Should ignore aliases.
                        Butler(ResourcePath("label", forceAbsolute=False))
                    with self.assertRaises(KeyError) as cm:
                        Butler.get_repo_uri("missing")
                    self.assertEqual(
                        Butler.get_repo_uri("missing", True), ResourcePath("missing", forceAbsolute=False)
                    )
                    self.assertIn("not known to", str(cm.exception))
                    # Should report no failure.
                    self.assertEqual(ButlerRepoIndex.get_failure_reason(), "")
        with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
            # Now with empty configuration.
            butler_index = Config()
            butler_index.dumpToUri(temp_file)
            with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                with self.assertRaisesRegex(FileNotFoundError, "(no known aliases)"):
                    Butler("label")
        with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
            # Now with bad contents.
            with open(temp_file.ospath, "w") as fh:
                print("'", file=fh)
            with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
                with self.assertRaisesRegex(FileNotFoundError, "(no known aliases:.*could not be read)"):
                    Butler("label")
        with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
            with self.assertRaises(FileNotFoundError):
                Butler.get_repo_uri("label")
            self.assertEqual(Butler.get_known_repos(), set())

            with self.assertRaisesRegex(FileNotFoundError, "index file not found"):
                Butler("label")

        # Check that we can create Butler when the alias file is not found.
        butler = Butler(self.tmpConfigFile, writeable=False)
        self.assertIsInstance(butler, Butler)
        with self.assertRaises(KeyError) as cm:
            # No environment variable set.
            Butler.get_repo_uri("label")
        self.assertEqual(Butler.get_repo_uri("label", True), ResourcePath("label", forceAbsolute=False))
        self.assertIn("No repository index defined", str(cm.exception))
        with self.assertRaisesRegex(FileNotFoundError, "no known aliases.*No repository index"):
            # No aliases registered.
            Butler("not_there")
        self.assertEqual(Butler.get_known_repos(), set())

    def testBasicPutGet(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        self.runPutGetTest(storageClass, "test_metric")

    def testCompositePutGetConcrete(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly")
        butler = self.runPutGetTest(storageClass, "test_metric")

        # Should *not* be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])
        self.assertIsInstance(uri, ResourcePath)
        self.assertFalse(components)
        self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
        self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)
        self.assertFalse(components)
        self.assertIsInstance(uri, ResourcePath)
        self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
        self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")

    def testCompositePutGetVirtual(self) -> None:
        storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp")
        butler = self.runPutGetTest(storageClass, "test_metric_comp")

        # Should be disassembled
        datasets = list(butler.registry.queryDatasets(..., collections=self.default_run))
        self.assertEqual(len(datasets), 1)
        uri, components = butler.getURIs(datasets[0])

        if butler.datastore.isEphemeral:
            # Never disassemble in-memory datastore
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}")
            self.assertIn("423", str(uri), f"Checking visit is in URI {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}")

        # Predicted dataset
        dataId = {"instrument": "DummyCamComp", "visit": 424}
        uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True)

        if butler.datastore.isEphemeral:
            # Never disassembled
            self.assertIsInstance(uri, ResourcePath)
            self.assertFalse(components)
            self.assertIn("424", str(uri), f"Checking visit is in URI {uri}")
            self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}")
        else:
            self.assertIsNone(uri)
            self.assertEqual(set(components), set(storageClass.components))
            for compuri in components.values():
                self.assertIsInstance(compuri, ResourcePath)
                self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}")
                self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}")

    def testStorageClassOverrideGet(self) -> None:
        """Test storage class conversion on get with override."""
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        datasetTypeName = "anything"
        run = self.default_run

        butler, datasetType = self.create_butler(run, storageClass, datasetTypeName)

        # Create and store a dataset.
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCamComp", "visit": 423}

        ref = butler.put(metric, datasetType, dataId)

        # Return native type.
        retrieved = butler.get(ref)
        self.assertEqual(retrieved, metric)

        # Specify an override.
        new_sc = self.storageClassFactory.getStorageClass("MetricsConversion")
        model = butler.get(ref, storageClass=new_sc)
        self.assertNotEqual(type(model), type(retrieved))
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override later.
        deferred = butler.getDeferred(ref)
        model = deferred.get(storageClass=new_sc)
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Defer but override up front.
        deferred = butler.getDeferred(ref, storageClass=new_sc)
        model = deferred.get()
        self.assertIs(type(model), new_sc.pytype)
        self.assertEqual(retrieved, model)

        # Retrieve a component. Should be a tuple.
        data = butler.get("anything.data", dataId, storageClass="StructuredDataDataTestTuple")
        self.assertIs(type(data), tuple)
        self.assertEqual(data, tuple(retrieved.data))

        # Parameter on the write storage class should work regardless
        # of read storage class.
        data = butler.get(
            "anything.data",
            dataId,
            storageClass="StructuredDataDataTestTuple",
            parameters={"slice": slice(2, 4)},
        )
        self.assertEqual(len(data), 2)

        # Try a parameter that is known to the read storage class but not
        # the write storage class.
        with self.assertRaises(KeyError):
            butler.get(
                "anything.data",
                dataId,
                storageClass="StructuredDataDataTestTuple",
                parameters={"xslice": slice(2, 4)},
            )

    def testPytypePutCoercion(self) -> None:
        """Test python type coercion on Butler.get and put."""
        # Store some data with the normal example storage class.
        storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
        datasetTypeName = "test_metric"
        butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName)

        dataId = {"instrument": "DummyCamComp", "visit": 423}

        # Put a dict and this should coerce to a MetricsExample
        test_dict = {"summary": {"a": 1}, "output": {"b": 2}}
        metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424)
        test_metric = butler.get(metric_ref)
        self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample")
        self.assertEqual(test_metric.summary, test_dict["summary"])
        self.assertEqual(test_metric.output, test_dict["output"])

        # Check that the put still works if a DatasetType is given with
        # a definition matching this python type.
        registry_type = butler.registry.getDatasetType(datasetTypeName)
        this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson")
        metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425)
        self.assertEqual(metric2_ref.datasetType, registry_type)

        # The get will return the type expected by registry.
        test_metric2 = butler.get(metric2_ref)
        self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample")

        # Make a new DatasetRef with the compatible but different DatasetType.
        # This should now return a dict.
        new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run)
        test_dict2 = butler.get(new_ref)
        self.assertEqual(get_full_type_name(test_dict2), "dict")

        # Get it again with the compatible but different dataset type
        # definition, this time using get() with a dataId rather than a ref.
        # This should be consistent with the ref-based get() behavior and
        # return the python type of the given DatasetType.
        test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
        self.assertEqual(get_full_type_name(test_dict3), "dict")

    def testIngest(self) -> None:
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        # Create and register a DatasetType
        dimensions = butler.dimensions.extract(["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"}
            )

        butler.registry.insertDimensionData(
            "visit",
            {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"},
            {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"},
        )

        formatter = doImportType("lsst.daf.butler.formatters.yaml.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 423, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, run=self.default_run)

            datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = butler.registry.expandDataId(
                {"instrument": "DummyCamComp", "visit": 424, "detector": detector}
            )
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, run=self.default_run))

        # Test "move" transfer to ensure that the files themselves
        # have disappeared following ingest.
        with ResourcePath.temporary_uri(suffix=".yaml") as tempFile:
            tempFile.transfer_from(ResourcePath(metricFile), transfer="copy")

            datasets = []
            datasets.append(FileDataset(path=tempFile, refs=refs, formatter=MultiDetectorFormatter))

            # For first ingest use copy.
            butler.ingest(*datasets, transfer="copy", record_validation_info=False)

            # Now try to ingest again in "execution butler" mode where
            # the registry entries exist but the datastore does not have
            # the files. We also need to strip the dimension records to ensure
            # that they will be re-added by the ingest.
            ref = datasets[0].refs[0]
            datasets[0].refs = [
                cast(
                    DatasetRef,
                    butler.registry.findDataset(ref.datasetType, dataId=ref.dataId, collections=ref.run),
                )
                for ref in datasets[0].refs
            ]
            all_refs = []
            for dataset in datasets:
                refs = []
                for ref in dataset.refs:
                    # Create a dict from the dataId to drop the records.
                    new_data_id = {str(k): v for k, v in ref.dataId.items()}
                    new_ref = butler.registry.findDataset(ref.datasetType, new_data_id, collections=ref.run)
                    assert new_ref is not None
                    self.assertFalse(new_ref.dataId.hasRecords())
                    refs.append(new_ref)
                dataset.refs = refs
                all_refs.extend(dataset.refs)
            butler.pruneDatasets(all_refs, disassociate=False, unstore=True, purge=False)

            # Use move mode to test that the file is deleted. Also
            # disable recording of file size.
            butler.ingest(*datasets, transfer="move", record_validation_info=False)

            # Check that every ref now has records.
            for dataset in datasets:
                for ref in dataset.refs:
                    self.assertTrue(ref.dataId.hasRecords())

            # Ensure that the file has disappeared.
            self.assertFalse(tempFile.exists())

        # Check that the datastore recorded no file size.
        # Not all datastores can support this.
        try:
            infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0])  # type: ignore[attr-defined]
            self.assertEqual(infos[0].file_size, -1)
        except AttributeError:
            pass

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}")

        # Test that removing one does not break the second
        # This line will issue a warning log message for a ChainedDatastore
        # that uses an InMemoryDatastore since in-memory can not ingest
        # files.
        butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False)
        self.assertFalse(butler.exists(datasetTypeName, dataId1))
        self.assertTrue(butler.exists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

        # Ensure we can ingest 0 datasets
        datasets = []
        butler.ingest(*datasets)

    def testPickle(self) -> None:
        """Test pickle support."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerOut = pickle.loads(pickle.dumps(butler))
        self.assertIsInstance(butlerOut, Butler)
        self.assertEqual(butlerOut._config, butler._config)
        self.assertEqual(butlerOut.collections, butler.collections)
        self.assertEqual(butlerOut.run, butler.run)

    def testGetDatasetTypes(self) -> None:
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        dimensions = butler.dimensions.extract(["instrument", "visit", "physical_filter"])
        dimensionEntries: list[tuple[str, list[Mapping[str, Any]]]] = [
            (
                "instrument",
                [
                    {"instrument": "DummyCam"},
                    {"instrument": "DummyHSC"},
                    {"instrument": "DummyCamComp"},
                ],
            ),
            ("physical_filter", [{"instrument": "DummyCam", "name": "d-r", "band": "R"}]),
            ("visit", [{"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}]),
        ]
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        # Add needed Dimensions
        for element, data in dimensionEntries:
            butler.registry.insertDimensionData(element, *data)

        # When a DatasetType is added to the registry entries are not created
        # for components but querying them can return the components.
        datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"}
        components = set()
        for datasetTypeName in datasetTypeNames:
            # Create and register a DatasetType
            self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)

            for componentName in storageClass.components:
                components.add(DatasetType.nameWithComponent(datasetTypeName, componentName))

        fromRegistry: set[DatasetType] = set()
        for parent_dataset_type in butler.registry.queryDatasetTypes():
            fromRegistry.add(parent_dataset_type)
            fromRegistry.update(parent_dataset_type.makeAllComponentDatasetTypes())
        self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components)

        # Now that we have some dataset types registered, validate them
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

        # Add a new datasetType that will fail template validation
        self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry)
        if self.validationCanFail:
            with self.assertRaises(ValidationError):
                butler.validateConfiguration()

        # Rerun validation but with a subset of dataset type names
        butler.validateConfiguration(datasetTypeNames=["metric4"])

        # Rerun validation but ignore the bad datasetType
        butler.validateConfiguration(
            ignore=[
                "test_metric_comp",
                "metric3",
                "metric5",
                "calexp",
                "DummySC",
                "datasetType.component",
                "random_data",
                "random_data_2",
            ]
        )

    def testTransaction(self) -> None:
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        datasetTypeName = "test_metric"
        dimensions = butler.dimensions.extract(["instrument", "visit"])
        dimensionEntries: tuple[tuple[str, Mapping[str, Any]], ...] = (
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        )
        storageClass = self.storageClassFactory.getStorageClass("StructuredData")
        metric = makeExampleMetrics()
        dataId = {"instrument": "DummyCam", "visit": 42}
        # Create and register a DatasetType
        datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry)
        with self.assertRaises(TransactionTestError):
            with butler.transaction():
                # Add needed Dimensions
                for args in dimensionEntries:
                    butler.registry.insertDimensionData(*args)
                # Store a dataset
                ref = butler.put(metric, datasetTypeName, dataId)
                self.assertIsInstance(ref, DatasetRef)
                # Test get with a resolved DatasetRef
                metricOut = butler.get(ref)
                self.assertEqual(metric, metricOut)
                # Test get
                metricOut = butler.get(datasetTypeName, dataId)
                self.assertEqual(metric, metricOut)
                # Check we can get components
                self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric)
                raise TransactionTestError("This should roll back the entire transaction")
        with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"):
            butler.registry.expandDataId(dataId)
        # Should raise LookupError for missing data ID value
        with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
            butler.get(datasetTypeName, dataId)
        # Also check explicitly if Dataset entry is missing
        self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
        # Direct retrieval should not find the file in the Datastore
        with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
            butler.get(ref)

    def testMakeRepo(self) -> None:
        """Test that we can write butler configuration to a new repository via
        the Butler.makeRepo interface and then instantiate a butler from the
        repo root.
        """
        # Do not run the test if we know this datastore configuration does
        # not support a file system root
        if self.fullConfigKey is None:
            return

        # create two separate directories
        root1 = tempfile.mkdtemp(dir=self.root)
        root2 = tempfile.mkdtemp(dir=self.root)

        butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
        limited = Config(self.configFile)
        butler1 = Butler(butlerConfig)
        butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
        full = Config(self.tmpConfigFile)
        butler2 = Butler(butlerConfig)
        # Butlers should have the same configuration regardless of whether
        # defaults were expanded.
        self.assertEqual(butler1._config, butler2._config)
        # Config files loaded directly should not be the same.
        self.assertNotEqual(limited, full)
        # Make sure "limited" doesn't have a few keys we know it should be
        # inheriting from defaults.
        self.assertIn(self.fullConfigKey, full)
        self.assertNotIn(self.fullConfigKey, limited)

        # Collections don't appear until something is put in them
        collections1 = set(butler1.registry.queryCollections())
        self.assertEqual(collections1, set())
        self.assertEqual(set(butler2.registry.queryCollections()), collections1)

        # Check that a config with no associated file name will not
        # work properly with relocatable Butler repo
        butlerConfig.configFile = None
        with self.assertRaises(ValueError):
            Butler(butlerConfig)

        with self.assertRaises(FileExistsError):
            Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

    def testStringification(self) -> None:
        butler = Butler(self.tmpConfigFile, run=self.default_run)
        butlerStr = str(butler)

        if self.datastoreStr is not None:
            for testStr in self.datastoreStr:
                self.assertIn(testStr, butlerStr)
        if self.registryStr is not None:
            self.assertIn(self.registryStr, butlerStr)

        datastoreName = butler.datastore.name
        if self.datastoreName is not None:
            for testStr in self.datastoreName:
                self.assertIn(testStr, datastoreName)

    def testButlerRewriteDataId(self) -> None:
        """Test that dataIds can be rewritten based on dimension records."""
        butler = Butler(self.tmpConfigFile, run=self.default_run)

        storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
        datasetTypeName = "random_data"

        # Create dimension records.
        butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
        butler.registry.insertDimensionData(
            "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"}
        )
        butler.registry.insertDimensionData(
            "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"}
        )

        dimensions = butler.dimensions.extract(["instrument", "exposure"])
        datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        butler.registry.registerDatasetType(datasetType)

        n_exposures = 5
        dayobs = 20210530

        for i in range(n_exposures):
            butler.registry.insertDimensionData(
                "exposure",
                {
                    "instrument": "DummyCamComp",
                    "id": i,
                    "obs_id": f"exp{i}",
                    "seq_num": i,
                    "day_obs": dayobs,
                    "physical_filter": "d-r",
                },
            )

        # Write some data.
        for i in range(n_exposures):
            metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]}

            # Use the seq_num for the put to test rewriting.
            dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"}
            ref = butler.put(metric, datasetTypeName, dataId=dataId)

            # Check that the exposure is correct in the dataId
            self.assertEqual(ref.dataId["exposure"], i)

            # and check that we can get the dataset back with the same dataId
            new_metric = butler.get(datasetTypeName, dataId=dataId)
            self.assertEqual(new_metric, metric)

1222 """Common tests and specialization of ButlerTests for butlers backed 

1223 by datastores that inherit from FileDatastore. 

1224 """ 

1225 

1226 def checkFileExists(self, root: str | ResourcePath, relpath: str | ResourcePath) -> bool: 

1227 """Check if file exists at a given path (relative to root). 

1228 

1229 Test testPutTemplates verifies actual physical existance of the files 

1230 in the requested location. 

1231 """ 

1232 uri = ResourcePath(root, forceDirectory=True) 

1233 return uri.join(relpath).exists() 

1234 

1235 def testPutTemplates(self) -> None: 

1236 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1237 butler = Butler(self.tmpConfigFile, run=self.default_run) 

1238 

1239 # Add needed Dimensions 

1240 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

1241 butler.registry.insertDimensionData( 

1242 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

1243 ) 

1244 butler.registry.insertDimensionData( 

1245 "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"} 

1246 ) 

1247 butler.registry.insertDimensionData( 

1248 "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"} 

1249 ) 

1250 

1251 # Create and store a dataset 

1252 metric = makeExampleMetrics() 

1253 

1254 # Create two almost-identical DatasetTypes (both will use default 

1255 # template) 

1256 dimensions = butler.dimensions.extract(["instrument", "visit"]) 

1257 butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass)) 

1258 butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass)) 

1259 butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass)) 

1260 

1261 dataId1 = {"instrument": "DummyCamComp", "visit": 423} 

1262 dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"} 

1263 

1264 # Put with exactly the data ID keys needed 

1265 ref = butler.put(metric, "metric1", dataId1) 

1266 uri = butler.getURI(ref) 

1267 self.assertTrue(uri.exists()) 

1268 self.assertTrue( 

1269 uri.unquoted_path.endswith(f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle") 

1270 ) 

1271 

1272 # Check the template based on dimensions 

1273 if hasattr(butler.datastore, "templates"): 

1274 butler.datastore.templates.validateTemplates([ref]) 

1275 

1276 # Put with extra data ID keys (physical_filter is an optional 

1277 # dependency); should not change template (at least the way we're 

1278 # defining them to behave now; the important thing is that they 

1279 # must be consistent). 

1280 ref = butler.put(metric, "metric2", dataId2) 

1281 uri = butler.getURI(ref) 

1282 self.assertTrue(uri.exists()) 

1283 self.assertTrue( 

1284 uri.unquoted_path.endswith(f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle") 

1285 ) 

1286 

1287 # Check the template based on dimensions 

1288 if hasattr(butler.datastore, "templates"): 

1289 butler.datastore.templates.validateTemplates([ref]) 

1290 

1291 # Use a template that has a typo in dimension record metadata. 

1292 # Easier to test with a butler that has a ref with records attached. 

1293 template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits") 

1294 with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"): 

1295 path = template.format(ref) 

1296 self.assertEqual(path, f"a/v423/{ref.id}_fits") 

1297 

1298 template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits") 

1299 with self.assertRaises(KeyError): 

1300 with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"): 

1301 template.format(ref) 

1302 

1303 # Now use a file template that will not result in unique filenames 

1304 with self.assertRaises(FileTemplateValidationError): 

1305 butler.put(metric, "metric3", dataId1) 

1306 

1307 def testImportExport(self) -> None: 

1308 # Run put/get tests just to create and populate a repo. 

1309 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1310 self.runImportExportTest(storageClass) 

1311 

1312 @unittest.expectedFailure 

1313 def testImportExportVirtualComposite(self) -> None: 

1314 # Run put/get tests just to create and populate a repo. 

1315 storageClass = self.storageClassFactory.getStorageClass("StructuredComposite") 

1316 self.runImportExportTest(storageClass) 

1317 

1318 def runImportExportTest(self, storageClass: StorageClass) -> None: 

1319 """Test exporting and importing. 

1320 

1321 This test does an export to a temp directory and an import back 

1322 into a new temp directory repo. It does not assume a posix datastore. 

1323 """ 

1324 exportButler = self.runPutGetTest(storageClass, "test_metric") 

1325 

1326 # Test that we must have a file extension. 

1327 with self.assertRaises(ValueError): 

1328 with exportButler.export(filename="dump", directory=".") as export: 

1329 pass 

1330 

1331 # Test that unknown format is not allowed. 

1332 with self.assertRaises(ValueError): 

1333 with exportButler.export(filename="dump.fits", directory=".") as export: 

1334 pass 

1335 

1336 # Test that the repo actually has at least one dataset. 

1337 datasets = list(exportButler.registry.queryDatasets(..., collections=...)) 

1338 self.assertGreater(len(datasets), 0) 

1339 # Add a DimensionRecord that's unused by those datasets. 

1340 skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")} 

1341 exportButler.registry.insertDimensionData("skymap", skymapRecord) 

1342 # Export and then import datasets. 

1343 with safeTestTempDir(TESTDIR) as exportDir: 

1344 exportFile = os.path.join(exportDir, "exports.yaml") 

1345 with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export: 

1346 export.saveDatasets(datasets) 

1347 # Export the same datasets again. This should quietly do 

1348 # nothing because of internal deduplication, and it shouldn't 

1349 # complain about being asked to export the "htm7" elements even 

1350 # though there aren't any in these datasets or in the database. 

1351 export.saveDatasets(datasets, elements=["htm7"]) 

1352 # Save one of the data IDs again; this should be harmless 

1353 # because of internal deduplication. 

1354 export.saveDataIds([datasets[0].dataId]) 

1355 # Save some dimension records directly. 

1356 export.saveDimensionData("skymap", [skymapRecord]) 

1357 self.assertTrue(os.path.exists(exportFile)) 

1358 with safeTestTempDir(TESTDIR) as importDir: 

1359 # We always want this to be a local POSIX butler.

1360 Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml"))) 

1361 # Calling script.butlerImport tests the implementation of the 

1362 # butler command line interface "import" subcommand. Functions 

1363 # in the script folder are generally considered protected and 

1364 # should not be used as a public API.

1365 with open(exportFile) as f: 

1366 script.butlerImport( 

1367 importDir, 

1368 export_file=f, 

1369 directory=exportDir, 

1370 transfer="auto", 

1371 skip_dimensions=None, 

1372 ) 

1373 importButler = Butler(importDir, run=self.default_run) 

1374 for ref in datasets: 

1375 with self.subTest(ref=ref): 

1376 # Test for existence by passing in the DatasetType and 

1377 # data ID separately, to avoid lookup by dataset_id. 

1378 self.assertTrue(importButler.exists(ref.datasetType, ref.dataId)) 

1379 self.assertEqual( 

1380 list(importButler.registry.queryDimensionRecords("skymap")), 

1381 [importButler.dimensions["skymap"].RecordClass(**skymapRecord)], 

1382 ) 

1383 
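# --- Editor's sketch (not part of the measured source): the export/import
# round trip above condensed into one helper. ``source`` is a populated
# Butler and ``tmp`` a scratch directory (hypothetical names).
def _sketch_export_import(source: Butler, tmp: str) -> Butler:
    export_file = os.path.join(tmp, "exports.yaml")
    datasets = list(source.registry.queryDatasets(..., collections=...))
    with source.export(filename=export_file, directory=tmp, transfer="auto") as export:
        # saveDatasets also records the dimension data the datasets need.
        export.saveDatasets(datasets)
    target_root = os.path.join(tmp, "copy")
    Butler.makeRepo(target_root)
    target = Butler(target_root, writeable=True)
    target.import_(filename=export_file, directory=tmp, transfer="auto")
    return target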

1384 def testRemoveRuns(self) -> None: 

1385 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1386 butler = Butler(self.tmpConfigFile, writeable=True) 

1387 # Load registry data with dimensions to hang datasets off of. 

1388 registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry")) 

1389 butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) 

1390 # Add some RUN-type collections.

1391 run1 = "run1" 

1392 butler.registry.registerRun(run1) 

1393 run2 = "run2" 

1394 butler.registry.registerRun(run2) 

1395 # Put a dataset in each run.

1396 metric = makeExampleMetrics() 

1397 dimensions = butler.dimensions.extract(["instrument", "physical_filter"]) 

1398 datasetType = self.addDatasetType( 

1399 "prune_collections_test_dataset", dimensions, storageClass, butler.registry 

1400 ) 

1401 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1) 

1402 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2) 

1403 uri1 = butler.getURI(ref1) 

1404 uri2 = butler.getURI(ref2) 

1405 

1406 with self.assertRaises(OrphanedRecordError): 

1407 butler.registry.removeDatasetType(datasetType.name) 

1408 

1409 # Remove from both runs with different values for unstore. 

1410 butler.removeRuns([run1], unstore=True) 

1411 butler.removeRuns([run2], unstore=False) 

1412 # Should be nothing in registry for either one, and datastore should 

1413 # not think either exists. 

1414 with self.assertRaises(MissingCollectionError): 

1415 butler.registry.getCollectionType(run1) 

1416 with self.assertRaises(MissingCollectionError): 

1417 butler.registry.getCollectionType(run2) 

1418 self.assertFalse(butler.datastore.exists(ref1)) 

1419 self.assertFalse(butler.datastore.exists(ref2)) 

1420 # The ref we unstored should be gone according to the URI, but the 

1421 # one we forgot should still be around. 

1422 self.assertFalse(uri1.exists()) 

1423 self.assertTrue(uri2.exists()) 

1424 

1425 # Now that the collections have been pruned, we can remove the

1426 # dataset type.

1427 butler.registry.removeDatasetType(datasetType.name) 

1428 

1429 with self.assertLogs("lsst.daf.butler.registries", "INFO") as cm: 

1430 butler.registry.removeDatasetType(("test*", "test*"))

1431 self.assertIn("not defined", "\n".join(cm.output)) 

1432 
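# --- Editor's sketch (not part of the measured source): the two removeRuns
# modes exercised above, for a writeable butler and hypothetical run names.
def _sketch_remove_runs(butler: Butler) -> None:
    # unstore=True also deletes the file artifacts from the datastore;
    # unstore=False only forgets them, leaving the files in place.
    butler.removeRuns(["scratch_run_a"], unstore=True)
    butler.removeRuns(["scratch_run_b"], unstore=False)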

1433 

1434class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1435 """PosixDatastore specialization of a butler."""

1436 

1437 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1438 fullConfigKey: str | None = ".datastore.formatters" 

1439 validationCanFail = True 

1440 datastoreStr = ["/tmp"] 

1441 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"] 

1442 registryStr = "/gen3.sqlite3" 

1443 

1444 def testPathConstructor(self) -> None: 

1445 """Independent test of constructor using PathLike.""" 

1446 butler = Butler(self.tmpConfigFile, run=self.default_run) 

1447 self.assertIsInstance(butler, Butler) 

1448 

1449 # And again with a Path object with the butler yaml 

1450 path = pathlib.Path(self.tmpConfigFile) 

1451 butler = Butler(path, writeable=False) 

1452 self.assertIsInstance(butler, Butler) 

1453 

1454 # And again with a Path object without the butler yaml

1455 # (skipping this check if the tmp config doesn't end in

1456 # butler.yaml, which is the case for some subclasses).

1457 if self.tmpConfigFile.endswith("butler.yaml"): 

1458 path = pathlib.Path(os.path.dirname(self.tmpConfigFile)) 

1459 butler = Butler(path, writeable=False) 

1460 self.assertIsInstance(butler, Butler) 

1461 

1462 def testExportTransferCopy(self) -> None: 

1463 """Test local export using all transfer modes."""

1464 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1465 exportButler = self.runPutGetTest(storageClass, "test_metric") 

1466 # Test that the repo actually has at least one dataset. 

1467 datasets = list(exportButler.registry.queryDatasets(..., collections=...)) 

1468 self.assertGreater(len(datasets), 0) 

1469 uris = [exportButler.getURI(d) for d in datasets] 

1470 assert isinstance(exportButler.datastore, FileDatastore) 

1471 datastoreRoot = exportButler.datastore.root 

1472 

1473 pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris] 

1474 

1475 for path in pathsInStore: 

1476 # Assume local file system 

1477 assert path is not None 

1478 self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}") 

1479 

1480 for transfer in ("copy", "link", "symlink", "relsymlink"): 

1481 with safeTestTempDir(TESTDIR) as exportDir: 

1482 with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export: 

1483 export.saveDatasets(datasets) 

1484 for path in pathsInStore: 

1485 assert path is not None 

1486 self.assertTrue( 

1487 self.checkFileExists(exportDir, path), 

1488 f"Check that mode {transfer} exported files", 

1489 ) 

1490 

1491 def testPruneDatasets(self) -> None: 

1492 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1493 butler = Butler(self.tmpConfigFile, writeable=True) 

1494 assert isinstance(butler.datastore, FileDatastore) 

1495 # Load registry data with dimensions to hang datasets off of. 

1496 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry")) 

1497 butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) 

1498 # Add some RUN-type collections. 

1499 run1 = "run1" 

1500 butler.registry.registerRun(run1) 

1501 run2 = "run2" 

1502 butler.registry.registerRun(run2) 

1503 # Put some datasets. ref1 and ref2 have the same data ID, and are in

1504 # different runs. ref3 has a different data ID.

1505 metric = makeExampleMetrics() 

1506 dimensions = butler.dimensions.extract(["instrument", "physical_filter"]) 

1507 datasetType = self.addDatasetType( 

1508 "prune_collections_test_dataset", dimensions, storageClass, butler.registry 

1509 ) 

1510 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1) 

1511 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2) 

1512 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1) 

1513 

1514 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1515 for ref, stored in many_stored.items(): 

1516 self.assertTrue(stored, f"Ref {ref} should be stored") 

1517 

1518 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1519 for ref, exists in many_exists.items(): 

1520 self.assertTrue(exists, f"Checking ref {ref} exists.") 

1521 self.assertEqual(exists, DatasetExistence.VERIFIED, f"Ref {ref} should be stored") 

1522 

1523 # Simple prune. 

1524 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1525 self.assertFalse(butler.exists(ref1.datasetType, ref1.dataId, collections=run1)) 

1526 

1527 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1528 for ref, stored in many_stored.items(): 

1529 self.assertFalse(stored, f"Ref {ref} should not be stored") 

1530 

1531 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1532 for ref, exists in many_exists.items(): 

1533 self.assertEqual(exists, DatasetExistence.UNRECOGNIZED, f"Ref {ref} should not be stored") 

1534 

1535 # Put data back. 

1536 ref1_new = butler.put(metric, ref1) 

1537 self.assertEqual(ref1_new, ref1) # Reuses original ID. 

1538 ref2 = butler.put(metric, ref2) 

1539 

1540 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1541 self.assertTrue(many_stored[ref1]) 

1542 self.assertTrue(many_stored[ref2]) 

1543 self.assertFalse(many_stored[ref3]) 

1544 

1545 ref3 = butler.put(metric, ref3) 

1546 

1547 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1548 for ref, exists in many_exists.items(): 

1549 self.assertTrue(exists, f"Ref {ref} should be stored")

1550 

1551 # Clear out the datasets from registry and start again. 

1552 refs = [ref1, ref2, ref3] 

1553 butler.pruneDatasets(refs, purge=True, unstore=True) 

1554 for ref in refs: 

1555 butler.put(metric, ref) 

1556 

1557 # Test different forms of file availability. 

1558 # Need to be in a state where: 

1559 # - one ref just has registry record. 

1560 # - one ref has a missing file but a datastore record. 

1561 # - one ref has a missing datastore record but file is there. 

1562 # - one ref does not exist anywhere. 

1563 # Do not need to test a ref that has everything since that is tested 

1564 # above. 

1565 ref0 = DatasetRef( 

1566 datasetType, 

1567 DataCoordinate.standardize( 

1568 {"instrument": "Cam1", "physical_filter": "Cam1-G"}, universe=butler.dimensions 

1569 ), 

1570 run=run1, 

1571 ) 

1572 

1573 # Delete from datastore and retain in Registry. 

1574 butler.pruneDatasets([ref1], purge=False, unstore=True, disassociate=False) 

1575 

1576 # File has been removed. 

1577 uri2 = butler.datastore.getURI(ref2) 

1578 uri2.remove() 

1579 

1580 # Datastore has lost track. 

1581 butler.datastore.forget([ref3]) 

1582 

1583 # First test with a standard butler. 

1584 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True) 

1585 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1586 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED) 

1587 self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE) 

1588 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED) 

1589 

1590 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=False) 

1591 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1592 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED | DatasetExistence._ASSUMED) 

1593 self.assertEqual(exists_many[ref2], DatasetExistence.KNOWN) 

1594 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ASSUMED) 

1595 self.assertTrue(exists_many[ref2]) 

1596 

1597 # Check that per-ref query gives the same answer as many query. 

1598 for ref, exists in exists_many.items(): 

1599 self.assertEqual(butler.exists(ref, full_check=False), exists) 

1600 

1601 # Test again with a trusting butler. 

1602 butler.datastore.trustGetRequest = True 

1603 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True) 

1604 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1605 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED) 

1606 self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE) 

1607 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ARTIFACT) 

1608 

1609 # Check that per-ref query gives the same answer as many query. 

1610 for ref, exists in exists_many.items(): 

1611 self.assertEqual(butler.exists(ref, full_check=True), exists) 

1612 

1613 # Create a ref that surprisingly has the UUID of an existing ref 

1614 # but is not the same. 

1615 ref_bad = DatasetRef(datasetType, dataId=ref3.dataId, run=ref3.run, id=ref2.id) 

1616 with self.assertRaises(ValueError): 

1617 butler.exists(ref_bad) 

1618 

1619 # Create a ref that has a compatible storage class. 

1620 ref_compat = ref2.overrideStorageClass("StructuredDataDict") 

1621 exists = butler.exists(ref_compat) 

1622 self.assertEqual(exists, exists_many[ref2]) 

1623 

1624 # Remove everything and start from scratch. 

1625 butler.datastore.trustGetRequest = False 

1626 butler.pruneDatasets(refs, purge=True, unstore=True) 

1627 for ref in refs: 

1628 butler.put(metric, ref) 

1629 

1630 # These tests mess directly with the trash table and can leave the 

1631 # datastore in an odd state. Do them at the end. 

1632 # Check that in normal mode, deleting the record will lead to 

1633 # trash not touching the file. 

1634 uri1 = butler.datastore.getURI(ref1) 

1635 butler.datastore.bridge.moveToTrash([ref1], transaction=None) # Update the dataset_location table 

1636 butler.datastore.forget([ref1]) 

1637 butler.datastore.trash(ref1) 

1638 butler.datastore.emptyTrash() 

1639 self.assertTrue(uri1.exists()) 

1640 uri1.remove() # Clean it up. 

1641 

1642 # Simulate execution butler setup by deleting the datastore 

1643 # record but keeping the file around and trusting. 

1644 butler.datastore.trustGetRequest = True 

1645 uri2 = butler.datastore.getURI(ref2) 

1646 uri3 = butler.datastore.getURI(ref3) 

1647 self.assertTrue(uri2.exists()) 

1648 self.assertTrue(uri3.exists()) 

1649 

1650 # Remove the datastore record. 

1651 butler.datastore.bridge.moveToTrash([ref2], transaction=None) # Update the dataset_location table 

1652 butler.datastore.forget([ref2]) 

1653 self.assertTrue(uri2.exists()) 

1654 butler.datastore.trash([ref2, ref3]) 

1655 # Immediate removal for ref2 file 

1656 self.assertFalse(uri2.exists()) 

1657 # But ref3 has to wait for the empty. 

1658 self.assertTrue(uri3.exists()) 

1659 butler.datastore.emptyTrash() 

1660 self.assertFalse(uri3.exists()) 

1661 

1662 # Clear out the datasets from registry. 

1663 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1664 
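# --- Editor's sketch (not part of the measured source): reading the
# DatasetExistence flags checked above for a single ref.
def _sketch_existence_report(butler: Butler, ref: DatasetRef) -> str:
    exists = butler.exists(ref, full_check=True)
    if exists == DatasetExistence.VERIFIED:
        return "registry record, datastore record, and artifact all present"
    if exists & DatasetExistence.RECORDED:
        # Known to registry, but the datastore record or artifact is missing.
        return f"partially known: {exists!r}"
    return "unrecognized"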

1665 def testPytypeCoercion(self) -> None: 

1666 """Test python type coercion on Butler.get and put.""" 

1667 # Store some data with the normal example storage class. 

1668 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1669 datasetTypeName = "test_metric" 

1670 butler = self.runPutGetTest(storageClass, datasetTypeName) 

1671 

1672 dataId = {"instrument": "DummyCamComp", "visit": 423} 

1673 metric = butler.get(datasetTypeName, dataId=dataId) 

1674 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample") 

1675 

1676 datasetType_ori = butler.registry.getDatasetType(datasetTypeName) 

1677 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents") 

1678 

1679 # Now need to hack the registry dataset type definition. 

1680 # There is no API for this. 

1681 assert isinstance(butler.registry, SqlRegistry) 

1682 manager = butler.registry._managers.datasets 

1683 assert hasattr(manager, "_db") and hasattr(manager, "_static") 

1684 manager._db.update( 

1685 manager._static.dataset_type, 

1686 {"name": datasetTypeName}, 

1687 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"}, 

1688 ) 

1689 

1690 # Force reset of dataset type cache 

1691 butler.registry.refresh() 

1692 

1693 datasetType_new = butler.registry.getDatasetType(datasetTypeName) 

1694 self.assertEqual(datasetType_new.name, datasetType_ori.name) 

1695 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel") 

1696 

1697 metric_model = butler.get(datasetTypeName, dataId=dataId) 

1698 self.assertNotEqual(type(metric_model), type(metric)) 

1699 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel") 

1700 

1701 # Put the model and read it back to show that everything now 

1702 # works as normal. 

1703 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424) 

1704 metric_model_new = butler.get(metric_ref) 

1705 self.assertEqual(metric_model_new, metric_model) 

1706 

1707 # Hack the storage class again to something that will fail on the 

1708 # get with no conversion class. 

1709 manager._db.update( 

1710 manager._static.dataset_type, 

1711 {"name": datasetTypeName}, 

1712 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"}, 

1713 ) 

1714 butler.registry.refresh() 

1715 

1716 with self.assertRaises(ValueError): 

1717 butler.get(datasetTypeName, dataId=dataId) 

1718 
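# --- Editor's sketch (not part of the measured source): the user-facing
# side of the coercion above. Requesting a compatible storage class on get,
# assuming the ref's storage class is convertible to "StructuredDataDict".
def _sketch_storage_class_override(butler: Butler, ref: DatasetRef) -> None:
    converted = butler.get(ref.overrideStorageClass("StructuredDataDict"))
    # The returned Python type now follows the requested storage class.
    print(get_full_type_name(converted))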

1719 

1720@unittest.skipUnless(testing is not None, "testing.postgresql module not found") 

1721class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1722 """PosixDatastore specialization of a butler using Postgres."""

1723 

1724 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1725 fullConfigKey = ".datastore.formatters" 

1726 validationCanFail = True 

1727 datastoreStr = ["/tmp"] 

1728 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"] 

1729 registryStr = "PostgreSQL@test" 

1730 postgresql: Any 

1731 

1732 @staticmethod 

1733 def _handler(postgresql: Any) -> None: 

1734 engine = sqlalchemy.engine.create_engine(postgresql.url()) 

1735 with engine.begin() as connection: 

1736 connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;")) 

1737 

1738 @classmethod 

1739 def setUpClass(cls) -> None: 

1740 # Create the postgres test server. 

1741 cls.postgresql = testing.postgresql.PostgresqlFactory( 

1742 cache_initialized_db=True, on_initialized=cls._handler 

1743 ) 

1744 super().setUpClass() 

1745 

1746 @classmethod 

1747 def tearDownClass(cls) -> None: 

1748 # Clean up any lingering SQLAlchemy engines/connections 

1749 # so they're closed before we shut down the server. 

1750 gc.collect() 

1751 cls.postgresql.clear_cache() 

1752 super().tearDownClass() 

1753 

1754 def setUp(self) -> None: 

1755 self.server = self.postgresql() 

1756 

1757 # Need to add a registry section to the config. 

1758 self._temp_config = False 

1759 config = Config(self.configFile) 

1760 config["registry", "db"] = self.server.url() 

1761 with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh: 

1762 config.dump(fh) 

1763 self.configFile = fh.name 

1764 self._temp_config = True 

1765 super().setUp() 

1766 

1767 def tearDown(self) -> None: 

1768 self.server.stop() 

1769 if self._temp_config and os.path.exists(self.configFile): 

1770 os.remove(self.configFile) 

1771 super().tearDown() 

1772 

1773 def testMakeRepo(self) -> None: 

1774 # The base class test assumes that it's using SQLite and that

1775 # the config file is acceptable to SQLite.

1776 raise unittest.SkipTest("Postgres config is not compatible with this test.") 

1777 
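# --- Editor's sketch (not part of the measured source): the
# testing.postgresql lifecycle used by the class above, assuming the module
# and a local PostgreSQL server are installed.
def _sketch_temporary_postgres() -> None:
    factory = testing.postgresql.PostgresqlFactory(cache_initialized_db=True)
    server = factory()  # Starts a throwaway server with its own data area.
    try:
        engine = sqlalchemy.engine.create_engine(server.url())
        with engine.begin() as connection:
            connection.execute(sqlalchemy.text("SELECT 1"))
    finally:
        server.stop()
        factory.clear_cache()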

1778 

1779class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1780 """InMemoryDatastore specialization of a butler."""

1781 

1782 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1783 fullConfigKey = None 

1784 useTempRoot = False 

1785 validationCanFail = False 

1786 datastoreStr = ["datastore='InMemory"] 

1787 datastoreName = ["InMemoryDatastore@"] 

1788 registryStr = "/gen3.sqlite3" 

1789 

1790 def testIngest(self) -> None: 

1791 pass 

1792 

1793 

1794class ChainedDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1795 """ChainedDatastore specialization of a butler."""

1796 

1797 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

1798 fullConfigKey = ".datastore.datastores.1.formatters" 

1799 validationCanFail = True 

1800 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"] 

1801 datastoreName = [ 

1802 "InMemoryDatastore@", 

1803 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1", 

1804 "SecondDatastore", 

1805 ] 

1806 registryStr = "/gen3.sqlite3" 

1807 

1808 

1809class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase): 

1810 """Test that a yaml file in one location can refer to a root in another.""" 

1811 

1812 datastoreStr = ["dir1"] 

1813 # Disable the makeRepo test since we are deliberately not using 

1814 # butler.yaml as the config name. 

1815 fullConfigKey = None 

1816 

1817 def setUp(self) -> None: 

1818 self.root = makeTestTempDir(TESTDIR) 

1819 

1820 # Make a new repository in one place 

1821 self.dir1 = os.path.join(self.root, "dir1") 

1822 Butler.makeRepo(self.dir1, config=Config(self.configFile)) 

1823 

1824 # Move the yaml file to a different place and add a "root" 

1825 self.dir2 = os.path.join(self.root, "dir2") 

1826 os.makedirs(self.dir2, exist_ok=True) 

1827 configFile1 = os.path.join(self.dir1, "butler.yaml") 

1828 config = Config(configFile1) 

1829 config["root"] = self.dir1 

1830 configFile2 = os.path.join(self.dir2, "butler2.yaml") 

1831 config.dumpToUri(configFile2) 

1832 os.remove(configFile1) 

1833 self.tmpConfigFile = configFile2 

1834 

1835 def testFileLocations(self) -> None: 

1836 self.assertNotEqual(self.dir1, self.dir2) 

1837 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml"))) 

1838 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml"))) 

1839 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3"))) 

1840 
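# --- Editor's sketch (not part of the measured source): relocating a repo
# config by pointing its "root" back at the original repository directory,
# as the test above verifies.
def _sketch_relocate_config(repo_dir: str, config_dir: str) -> Butler:
    config = Config(os.path.join(repo_dir, "butler.yaml"))
    config["root"] = repo_dir  # Makes the config usable from anywhere.
    relocated = os.path.join(config_dir, "butler2.yaml")
    config.dumpToUri(relocated)
    return Butler(relocated, writeable=False)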

1841 

1842class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase): 

1843 """Test that a config file created by makeRepo outside of the repo works."""

1844 

1845 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1846 

1847 def setUp(self) -> None: 

1848 self.root = makeTestTempDir(TESTDIR) 

1849 self.root2 = makeTestTempDir(TESTDIR) 

1850 

1851 self.tmpConfigFile = os.path.join(self.root2, "different.yaml") 

1852 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1853 

1854 def tearDown(self) -> None: 

1855 if os.path.exists(self.root2): 

1856 shutil.rmtree(self.root2, ignore_errors=True) 

1857 super().tearDown() 

1858 

1859 def testConfigExistence(self) -> None: 

1860 c = Config(self.tmpConfigFile) 

1861 uri_config = ResourcePath(c["root"]) 

1862 uri_expected = ResourcePath(self.root, forceDirectory=True) 

1863 self.assertEqual(uri_config.geturl(), uri_expected.geturl()) 

1864 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path") 

1865 

1866 def testPutGet(self) -> None: 

1867 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1868 self.runPutGetTest(storageClass, "test_metric") 

1869 
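# --- Editor's sketch (not part of the measured source): writing the repo
# config somewhere other than the repo root, as the outfile test cases
# around here exercise. ``outfile`` may be a plain path, a directory, or a
# URI string.
def _sketch_make_repo_outfile(root: str, outfile: str) -> Butler:
    config = Butler.makeRepo(root, outfile=outfile)
    # makeRepo returns the written Config, which can seed a Butler directly.
    return Butler(config, writeable=False)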

1870 

1871class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase): 

1872 """Test that a config file created by makeRepo with a directory outfile works."""

1873 

1874 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1875 

1876 def setUp(self) -> None: 

1877 self.root = makeTestTempDir(TESTDIR) 

1878 self.root2 = makeTestTempDir(TESTDIR) 

1879 

1880 self.tmpConfigFile = self.root2 

1881 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1882 

1883 def testConfigExistence(self) -> None: 

1884 # Append the yaml file else Config constructor does not know the file 

1885 # type. 

1886 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml") 

1887 super().testConfigExistence() 

1888 

1889 

1890class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase): 

1891 """Test that a config file created by makeRepo with a URI outfile works."""

1892 

1893 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1894 

1895 def setUp(self) -> None: 

1896 self.root = makeTestTempDir(TESTDIR) 

1897 self.root2 = makeTestTempDir(TESTDIR) 

1898 

1899 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl() 

1900 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1901 

1902 

1903@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!") 

1904class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1905 """S3Datastore specialization of a butler: an S3-backed Datastore plus

1906 a local SQLite SqlRegistry.

1907 """

1908 

1909 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml") 

1910 fullConfigKey = None 

1911 validationCanFail = True 

1912 

1913 bucketName = "anybucketname" 

1914 """Name of the bucket that will be used in the tests. The name is read from

1915 the config file used with the tests during set-up. 

1916 """ 

1917 

1918 root = "butlerRoot/" 

1919 """Root repository directory expected to be used in case useTempRoot=False. 

1920 Otherwise the root is set to a randomly generated 20-character string

1921 during set-up.

1922 """ 

1923 

1924 datastoreStr = [f"datastore={root}"] 

1925 """Contains all expected root locations in a format expected to be 

1926 returned by Butler stringification. 

1927 """ 

1928 

1929 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"] 

1930 """The expected format of the S3 Datastore string.""" 

1931 

1932 registryStr = "/gen3.sqlite3" 

1933 """Expected format of the Registry string.""" 

1934 

1935 mock_s3 = mock_s3() 

1936 """The mocked s3 interface from moto.""" 

1937 

1938 def genRoot(self) -> str: 

1939 """Return a random string of length 20 to serve as a root

1940 name for the temporary bucket repo.

1941 

1942 This is equivalent to tempfile.mkdtemp, as this is what self.root

1943 becomes when useTempRoot is True.

1944 """ 

1945 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20)) 

1946 return rndstr + "/" 

1947 

1948 def setUp(self) -> None: 

1949 config = Config(self.configFile) 

1950 uri = ResourcePath(config[".datastore.datastore.root"]) 

1951 self.bucketName = uri.netloc 

1952 

1953 # Enable S3 mocking of tests. 

1954 self.mock_s3.start() 

1955 

1956 # Set up some fake credentials if they do not exist.

1957 self.usingDummyCredentials = setAwsEnvCredentials() 

1958 

1959 if self.useTempRoot: 

1960 self.root = self.genRoot() 

1961 rooturi = f"s3://{self.bucketName}/{self.root}" 

1962 config.update({"datastore": {"datastore": {"root": rooturi}}}) 

1963 

1964 # Need a local folder to store the registry database.

1965 self.reg_dir = makeTestTempDir(TESTDIR) 

1966 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

1967 

1968 # Moto needs to know that we expect the bucket to exist

1969 # (this used to be the class attribute bucketName).

1970 s3 = boto3.resource("s3") 

1971 s3.create_bucket(Bucket=self.bucketName) 

1972 

1973 self.datastoreStr = [f"datastore='{rooturi}'"] 

1974 self.datastoreName = [f"FileDatastore@{rooturi}"] 

1975 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False) 

1976 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml") 

1977 

1978 def tearDown(self) -> None: 

1979 s3 = boto3.resource("s3") 

1980 bucket = s3.Bucket(self.bucketName) 

1981 try: 

1982 bucket.objects.all().delete() 

1983 except botocore.exceptions.ClientError as e: 

1984 if e.response["Error"]["Code"] == "404": 

1985 # The key was not reachable; ignore it.

1986 pass 

1987 else: 

1988 raise 

1989 

1990 bucket = s3.Bucket(self.bucketName) 

1991 bucket.delete() 

1992 

1993 # Stop the S3 mock. 

1994 self.mock_s3.stop() 

1995 

1996 # Unset any dummy credentials we may have set.

1997 if self.usingDummyCredentials: 

1998 unsetAwsEnvCredentials() 

1999 

2000 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

2001 shutil.rmtree(self.reg_dir, ignore_errors=True) 

2002 

2003 if self.useTempRoot and os.path.exists(self.root): 

2004 shutil.rmtree(self.root, ignore_errors=True) 

2005 

2006 super().tearDown() 

2007 
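# --- Editor's sketch (not part of the measured source): the moto mock-S3
# pattern the class above relies on, assuming boto3 and moto are available.
@mock_s3
def _sketch_mocked_bucket(bucket_name: str = "anybucketname") -> None:
    using_dummy = setAwsEnvCredentials()  # Fake credentials if none are set.
    try:
        s3 = boto3.resource("s3")
        s3.create_bucket(Bucket=bucket_name)  # Intercepted by moto, not AWS.
        s3.Bucket(bucket_name).delete()
    finally:
        if using_dummy:
            unsetAwsEnvCredentials()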

2008 

2009class PosixDatastoreTransfers(unittest.TestCase): 

2010 """Test data transfers between butlers. 

2011 

2012 Different dataset ID managers are tested: UUID to UUID and integer to

2013 integer. UUID to integer is not supported since we do not currently

2014 want to allow that. Integer to UUID is supported, with the caveat

2015 that a UUID4 will be generated, which is incorrect for raw

2016 dataset types; the test ignores that.

2017 """ 

2018 

2019 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

2020 storageClassFactory: StorageClassFactory 

2021 

2022 @classmethod 

2023 def setUpClass(cls) -> None: 

2024 cls.storageClassFactory = StorageClassFactory() 

2025 cls.storageClassFactory.addFromConfig(cls.configFile) 

2026 

2027 def setUp(self) -> None: 

2028 self.root = makeTestTempDir(TESTDIR) 

2029 self.config = Config(self.configFile) 

2030 

2031 def tearDown(self) -> None: 

2032 removeTestTempDir(self.root) 

2033 

2034 def create_butler(self, manager: str, label: str) -> Butler: 

2035 config = Config(self.configFile) 

2036 config["registry", "managers", "datasets"] = manager 

2037 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True) 

2038 

2039 def create_butlers(self, manager1: str | None = None, manager2: str | None = None) -> None: 

2040 default = "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID" 

2041 if manager1 is None: 

2042 manager1 = default 

2043 if manager2 is None: 

2044 manager2 = default 

2045 self.source_butler = self.create_butler(manager1, "1") 

2046 self.target_butler = self.create_butler(manager2, "2") 

2047 

2048 def testTransferUuidToUuid(self) -> None: 

2049 self.create_butlers() 

2050 self.assertButlerTransfers() 

2051 

2052 def _enable_trust(self, datastore: Datastore) -> None: 

2053 if hasattr(datastore, "trustGetRequest"): 

2054 datastore.trustGetRequest = True 

2055 elif hasattr(datastore, "datastores"): 

2056 for datastore in datastore.datastores: 

2057 if hasattr(datastore, "trustGetRequest"): 

2058 datastore.trustGetRequest = True 

2059 

2060 def testTransferMissing(self) -> None: 

2061 """Test transfers where datastore records are missing. 

2062 

2063 This is how execution butler works. 

2064 """ 

2065 self.create_butlers() 

2066 

2067 # Configure the source butler to allow trust. 

2068 self._enable_trust(self.source_butler.datastore) 

2069 

2070 self.assertButlerTransfers(purge=True) 

2071 

2072 def testTransferMissingDisassembly(self) -> None: 

2073 """Test transfers where datastore records are missing. 

2074 

2075 This is how execution butler works. 

2076 """ 

2077 self.create_butlers() 

2078 

2079 # Configure the source butler to allow trust. 

2080 self._enable_trust(self.source_butler.datastore) 

2081 

2082 # Test disassembly. 

2083 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite") 

2084 

2085 def testAbsoluteURITransferDirect(self) -> None: 

2086 """Test transfer using an absolute URI.""" 

2087 self._absolute_transfer("auto") 

2088 

2089 def testAbsoluteURITransferCopy(self) -> None: 

2090 """Test transfer using an absolute URI.""" 

2091 self._absolute_transfer("copy") 

2092 

2093 def _absolute_transfer(self, transfer: str) -> None: 

2094 self.create_butlers() 

2095 

2096 storageClassName = "StructuredData" 

2097 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

2098 datasetTypeName = "random_data" 

2099 run = "run1" 

2100 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

2101 

2102 dimensions = self.source_butler.dimensions.extract(()) 

2103 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

2104 self.source_butler.registry.registerDatasetType(datasetType) 

2105 

2106 metrics = makeExampleMetrics() 

2107 with ResourcePath.temporary_uri(suffix=".json") as temp: 

2108 dataId = DataCoordinate.makeEmpty(self.source_butler.dimensions) 

2109 source_refs = [DatasetRef(datasetType, dataId, run=run)] 

2110 temp.write(json.dumps(metrics.exportAsDict()).encode()) 

2111 dataset = FileDataset(path=temp, refs=source_refs) 

2112 self.source_butler.ingest(dataset, transfer="direct") 

2113 

2114 self.target_butler.transfer_from( 

2115 self.source_butler, dataset.refs, register_dataset_types=True, transfer=transfer 

2116 ) 

2117 

2118 uri = self.target_butler.getURI(dataset.refs[0]) 

2119 if transfer == "auto": 

2120 self.assertEqual(uri, temp) 

2121 else: 

2122 self.assertNotEqual(uri, temp) 

2123 

2124 def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "StructuredData") -> None: 

2125 """Test that a run can be transferred to another butler.""" 

2126 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

2127 datasetTypeName = "random_data" 

2128 

2129 # The test will create 3 collections, and we will want to transfer

2130 # two of those three.

2131 runs = ["run1", "run2", "other"] 

2132 

2133 # Also want to use two different dataset types to ensure that 

2134 # grouping works. 

2135 datasetTypeNames = ["random_data", "random_data_2"] 

2136 

2137 # Create the run collections in the source butler. 

2138 for run in runs: 

2139 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

2140 

2141 # Create dimensions in source butler. 

2142 n_exposures = 30 

2143 self.source_butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

2144 self.source_butler.registry.insertDimensionData( 

2145 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

2146 ) 

2147 self.source_butler.registry.insertDimensionData( 

2148 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"} 

2149 ) 

2150 

2151 for i in range(n_exposures): 

2152 self.source_butler.registry.insertDimensionData( 

2153 "exposure", 

2154 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"}, 

2155 ) 

2156 

2157 # Create dataset types in the source butler. 

2158 dimensions = self.source_butler.dimensions.extract(["instrument", "exposure"]) 

2159 for datasetTypeName in datasetTypeNames: 

2160 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

2161 self.source_butler.registry.registerDatasetType(datasetType) 

2162 

2163 # Write a dataset to an unrelated run -- this will ensure that 

2164 # we are rewriting integer dataset ids in the target if necessary. 

2165 # Will not be relevant for UUID. 

2166 run = "distraction" 

2167 butler = Butler(butler=self.source_butler, run=run) 

2168 butler.put( 

2169 makeExampleMetrics(), 

2170 datasetTypeName, 

2171 exposure=1, 

2172 instrument="DummyCamComp", 

2173 physical_filter="d-r", 

2174 ) 

2175 

2176 # Write some example metrics to the source 

2177 butler = Butler(butler=self.source_butler) 

2178 

2179 # Set of DatasetRefs that should be in the list of refs to transfer 

2180 # but which will not be transferred. 

2181 deleted: set[DatasetRef] = set() 

2182 

2183 n_expected = 20 # Number of datasets expected to be transferred 

2184 source_refs = [] 

2185 for i in range(n_exposures): 

2186 # Put a third of the datasets into each collection; only retain

2187 # two-thirds.

2188 index = i % 3 

2189 run = runs[index] 

2190 datasetTypeName = datasetTypeNames[i % 2] 

2191 

2192 metric = MetricsExample( 

2193 summary={"counter": i}, output={"text": "metric"}, data=[2 * x for x in range(i)] 

2194 ) 

2195 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"} 

2196 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run) 

2197 

2198 # Remove the datastore record using low-level API 

2199 if purge: 

2200 # Remove records for a fraction. 

2201 if index == 1: 

2202 # For one of these delete the file as well. 

2203 # This allows the "missing" code to filter the 

2204 # file out. 

2205 # Access the individual datastores. 

2206 datastores = [] 

2207 if hasattr(butler.datastore, "datastores"): 

2208 datastores.extend(butler.datastore.datastores) 

2209 else: 

2210 datastores.append(butler.datastore) 

2211 

2212 if not deleted: 

2213 # For a chained datastore we need to remove 

2214 # files in each chain. 

2215 for datastore in datastores: 

2216 # The file might not be known to the datastore 

2217 # if constraints are used. 

2218 try: 

2219 primary, uris = datastore.getURIs(ref) 

2220 except FileNotFoundError: 

2221 continue 

2222 if primary: 

2223 if primary.scheme != "mem": 

2224 primary.remove() 

2225 for uri in uris.values(): 

2226 if uri.scheme != "mem": 

2227 uri.remove() 

2228 n_expected -= 1 

2229 deleted.add(ref) 

2230 

2231 # Remove the datastore record. 

2232 for datastore in datastores: 

2233 if hasattr(datastore, "removeStoredItemInfo"): 

2234 datastore.removeStoredItemInfo(ref) 

2235 

2236 if index < 2: 

2237 source_refs.append(ref) 

2238 if ref not in deleted: 

2239 new_metric = butler.get(ref) 

2240 self.assertEqual(new_metric, metric) 

2241 

2242 # Create some bad dataset types to ensure we check for inconsistent 

2243 # definitions. 

2244 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList") 

2245 for datasetTypeName in datasetTypeNames: 

2246 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass) 

2247 self.target_butler.registry.registerDatasetType(datasetType) 

2248 with self.assertRaises(ConflictingDefinitionError) as cm: 

2249 self.target_butler.transfer_from(self.source_butler, source_refs) 

2250 self.assertIn("dataset type differs", str(cm.exception)) 

2251 

2252 # And remove the bad definitions. 

2253 for datasetTypeName in datasetTypeNames: 

2254 self.target_butler.registry.removeDatasetType(datasetTypeName) 

2255 

2256 # Transfer without creating dataset types should fail. 

2257 with self.assertRaises(KeyError): 

2258 self.target_butler.transfer_from(self.source_butler, source_refs) 

2259 

2260 # Transfer without creating dimensions should fail. 

2261 with self.assertRaises(ConflictingDefinitionError) as cm: 

2262 self.target_butler.transfer_from(self.source_butler, source_refs, register_dataset_types=True) 

2263 self.assertIn("dimension", str(cm.exception)) 

2264 

2265 # The failed transfer above leaves the registry in an inconsistent

2266 # state because the run is created but then rolled back without 

2267 # the collection cache being cleared. For now force a refresh. 

2268 # Can remove with DM-35498. 

2269 self.target_butler.registry.refresh() 

2270 

2271 # Now transfer them to the second butler, including dimensions. 

2272 with self.assertLogs(level=logging.DEBUG) as log_cm: 

2273 transferred = self.target_butler.transfer_from( 

2274 self.source_butler, 

2275 source_refs, 

2276 register_dataset_types=True, 

2277 transfer_dimensions=True, 

2278 ) 

2279 self.assertEqual(len(transferred), n_expected) 

2280 log_output = ";".join(log_cm.output) 

2281 

2282 # A ChainedDatastore will use the in-memory datastore for mexists 

2283 # so we cannot rely on the mexists log message.

2284 self.assertIn("Number of datastore records found in source", log_output) 

2285 self.assertIn("Creating output run", log_output) 

2286 

2287 # Do the transfer twice to ensure that it will do nothing extra. 

2288 # Only do this if purge=True because it does not work for int 

2289 # dataset_id. 

2290 if purge: 

2291 # This should not need to register dataset types. 

2292 transferred = self.target_butler.transfer_from(self.source_butler, source_refs) 

2293 self.assertEqual(len(transferred), n_expected) 

2294 

2295 # Also do an explicit low-level transfer to trigger some 

2296 # edge cases. 

2297 with self.assertLogs(level=logging.DEBUG) as log_cm: 

2298 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs) 

2299 log_output = ";".join(log_cm.output) 

2300 self.assertIn("no file artifacts exist", log_output) 

2301 

2302 with self.assertRaises((TypeError, AttributeError)): 

2303 self.target_butler.datastore.transfer_from(self.source_butler, source_refs) # type: ignore 

2304 

2305 with self.assertRaises(ValueError): 

2306 self.target_butler.datastore.transfer_from( 

2307 self.source_butler.datastore, source_refs, transfer="split" 

2308 ) 

2309 

2310 # Now try to get the same refs from the new butler. 

2311 for ref in source_refs: 

2312 if ref not in deleted: 

2313 new_metric = self.target_butler.get(ref) 

2314 old_metric = self.source_butler.get(ref) 

2315 self.assertEqual(new_metric, old_metric) 

2316 

2317 # Now prune run2 collection and create instead a CHAINED collection. 

2318 # This should block the transfer. 

2319 self.target_butler.removeRuns(["run2"], unstore=True) 

2320 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED) 

2321 with self.assertRaises(CollectionTypeError): 

2322 # Re-importing the run1 datasets can be problematic if they 

2323 # use integer IDs, so filter those out.

2324 to_transfer = [ref for ref in source_refs if ref.run == "run2"] 

2325 self.target_butler.transfer_from(self.source_butler, to_transfer) 

2326 
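# --- Editor's sketch (not part of the measured source): the minimal
# transfer_from call pattern exercised above, for two writeable butlers.
def _sketch_transfer(source: Butler, target: Butler, refs: list[DatasetRef]) -> None:
    transferred = target.transfer_from(
        source,
        refs,
        transfer="auto",
        register_dataset_types=True,  # Create missing dataset types first.
        transfer_dimensions=True,  # Copy dimension records the refs need.
    )
    # Refs whose file artifacts are genuinely missing are skipped, so the
    # transferred list can be shorter than the input.
    assert len(transferred) <= len(refs)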

2327 

2328class ChainedDatastoreTransfers(PosixDatastoreTransfers): 

2329 """Test transfers using a chained datastore.""" 

2330 

2331 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

2332 

2333 

2334def setup_module(module: types.ModuleType) -> None: 

2335 """Set up the module for pytest.""" 

2336 clean_environment() 

2337 

2338 

2339if __name__ == "__main__": 

2340 clean_environment() 

2341 unittest.main()