Coverage for tests/test_butler.py: 13%

1263 statements  

coverage.py v7.2.7, created at 2023-07-21 09:55 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Tests for Butler. 

23""" 

24from __future__ import annotations 

25 

26import gc 

27import json 

28import logging 

29import os 

30import pathlib 

31import pickle 

32import posixpath 

33import random 

34import shutil 

35import string 

36import tempfile 

37import unittest 

38import uuid 

39from collections.abc import Mapping 

40from typing import TYPE_CHECKING, Any, cast 

41 

42try: 

43 import boto3 

44 import botocore 

45 from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials 

46 from moto import mock_s3 # type: ignore[import] 

47except ImportError: 

48 boto3 = None 

49 

50 def mock_s3(*args: Any, **kwargs: Any) -> Any: # type: ignore[no-untyped-def] 

51 """No-op decorator in case moto mock_s3 cannot be imported."""

52 return None 

53 
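# A minimal sketch of how this guard is typically consumed (the class
# name here is illustrative, not from this section): S3-backed test
# cases are skipped outright when boto3 is missing, so the no-op
# mock_s3 stand-in above is never applied in earnest.
#
#     @unittest.skipIf(boto3 is None, "boto3 AWS SDK not found")
#     @mock_s3
#     class S3ButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
#         ...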

54 

55try: 

56 # It's possible but silly to have testing.postgresql installed without 

57 # having the postgresql server installed (because then nothing in 

58 # testing.postgresql would work), so we use the presence of that module 

59 # to test whether we can expect the server to be available. 

60 import testing.postgresql # type: ignore[import] 

61except ImportError: 

62 testing = None 

63 

64import astropy.time 

65import sqlalchemy 

66from lsst.daf.butler import ( 

67 Butler, 

68 ButlerConfig, 

69 ButlerRepoIndex, 

70 CollectionType, 

71 Config, 

72 DataCoordinate, 

73 DatasetExistence, 

74 DatasetRef, 

75 DatasetType, 

76 FileDataset, 

77 FileTemplate, 

78 FileTemplateValidationError, 

79 StorageClassFactory, 

80 ValidationError, 

81 script, 

82) 

83from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG 

84from lsst.daf.butler.datastores.fileDatastore import FileDatastore 

85from lsst.daf.butler.registries.sql import SqlRegistry 

86from lsst.daf.butler.registry import ( 

87 CollectionError, 

88 CollectionTypeError, 

89 ConflictingDefinitionError, 

90 DataIdValueError, 

91 MissingCollectionError, 

92 OrphanedRecordError, 

93) 

94from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter 

95from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir, safeTestTempDir 

96from lsst.resources import ResourcePath 

97from lsst.utils import doImportType 

98from lsst.utils.introspection import get_full_type_name 

99 

100if TYPE_CHECKING: 

101 import types 

102 

103 from lsst.daf.butler import Datastore, DimensionGraph, Registry, StorageClass 

104 

105TESTDIR = os.path.abspath(os.path.dirname(__file__)) 

106 

107 

108def clean_environment() -> None: 

109 """Remove external environment variables that affect the tests.""" 

110 for k in ( 

111 "DAF_BUTLER_REPOSITORY_INDEX", 

112 "S3_ENDPOINT_URL", 

113 "AWS_ACCESS_KEY_ID", 

114 "AWS_SECRET_ACCESS_KEY", 

115 "AWS_SHARED_CREDENTIALS_FILE", 

116 ): 

117 os.environ.pop(k, None) 

118 
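# Hypothetical usage sketch: clean_environment() mutates os.environ in
# place, so a caller that needs the variables restored afterwards can
# wrap it in unittest.mock.patch.dict, which undoes all changes on exit.
#
#     with unittest.mock.patch.dict(os.environ):
#         clean_environment()
#         butler = Butler(config_file, writeable=False)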

119 

120def makeExampleMetrics() -> MetricsExample: 

121 """Return example dataset suitable for tests.""" 

122 return MetricsExample( 

123 {"AM1": 5.2, "AM2": 30.6}, 

124 {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}}, 

125 [563, 234, 456.7, 752, 8, 9, 27], 

126 ) 

127 
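# For orientation, the put/get round trip these example metrics feed
# looks roughly like this (a sketch; it assumes a writeable butler and
# a registered "test_metric" dataset type, as set up in the tests
# below):
#
#     metric = makeExampleMetrics()
#     ref = butler.put(metric, "test_metric", instrument="DummyCamComp", visit=423)
#     assert butler.get(ref) == metric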

128 

129class TransactionTestError(Exception): 

130 """Specific error for testing transactions, to prevent the misdiagnosis

131 that might otherwise occur when a standard exception is used.

132 """ 

133 

134 pass 

135 
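# The exception is raised inside butler.transaction() blocks so a test
# can assert that everything rolled back; a bespoke type means an
# unrelated RuntimeError can never satisfy the assertion. The pattern,
# in miniature (see testTransaction below):
#
#     with self.assertRaises(TransactionTestError):
#         with butler.transaction():
#             butler.put(metric, datasetTypeName, dataId)
#             raise TransactionTestError("roll back the entire transaction")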

136 

137class ButlerConfigTests(unittest.TestCase): 

138 """Simple tests for ButlerConfig that are not tested in any other test 

139 cases. 

140 """ 

141 

142 def testSearchPath(self) -> None: 

143 configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml") 

144 with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm: 

145 config1 = ButlerConfig(configFile) 

146 self.assertNotIn("testConfigs", "\n".join(cm.output)) 

147 

148 overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs") 

149 with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm: 

150 config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory]) 

151 self.assertIn("testConfigs", "\n".join(cm.output)) 

152 

153 key = ("datastore", "records", "table") 

154 self.assertNotEqual(config1[key], config2[key]) 

155 self.assertEqual(config2[key], "override_record") 

156 
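# Sketch of the semantics exercised by testSearchPath: directories in
# searchPaths are consulted before the packaged defaults, so any key a
# config fragment there defines wins over the default value.
#
#     config = ButlerConfig(configFile, searchPaths=[overrideDirectory])
#     assert config[("datastore", "records", "table")] == "override_record"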

157 

158class ButlerPutGetTests(TestCaseMixin): 

159 """Helper method for running a suite of put/get tests from different 

160 butler configurations. 

161 """ 

162 

163 root: str 

164 default_run = "ingésτ😺" 

165 storageClassFactory: StorageClassFactory 

166 configFile: str 

167 tmpConfigFile: str 

168 

169 @staticmethod 

170 def addDatasetType( 

171 datasetTypeName: str, dimensions: DimensionGraph, storageClass: StorageClass | str, registry: Registry 

172 ) -> DatasetType: 

173 """Create a DatasetType and register it."""

174 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

175 registry.registerDatasetType(datasetType) 

176 return datasetType 

177 

178 @classmethod 

179 def setUpClass(cls) -> None: 

180 cls.storageClassFactory = StorageClassFactory() 

181 cls.storageClassFactory.addFromConfig(cls.configFile) 

182 

183 def assertGetComponents( 

184 self, 

185 butler: Butler, 

186 datasetRef: DatasetRef, 

187 components: tuple[str, ...], 

188 reference: Any, 

189 collections: Any = None, 

190 ) -> None: 

191 datasetType = datasetRef.datasetType 

192 dataId = datasetRef.dataId 

193 deferred = butler.getDeferred(datasetRef) 

194 

195 for component in components: 

196 compTypeName = datasetType.componentTypeName(component) 

197 result = butler.get(compTypeName, dataId, collections=collections) 

198 self.assertEqual(result, getattr(reference, component)) 

199 result_deferred = deferred.get(component=component) 

200 self.assertEqual(result_deferred, result) 

201 

202 def tearDown(self) -> None: 

203 removeTestTempDir(self.root) 

204 

205 def create_butler( 

206 self, run: str, storageClass: StorageClass | str, datasetTypeName: str 

207 ) -> tuple[Butler, DatasetType]: 

208 butler = Butler(self.tmpConfigFile, run=run) 

209 

210 collections = set(butler.registry.queryCollections()) 

211 self.assertEqual(collections, {run}) 

212 

213 # Create and register a DatasetType 

214 dimensions = butler.dimensions.extract(["instrument", "visit"]) 

215 

216 datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry) 

217 

218 # Add needed Dimensions 

219 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

220 butler.registry.insertDimensionData( 

221 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

222 ) 

223 butler.registry.insertDimensionData( 

224 "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"} 

225 ) 

226 visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai") 

227 visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai") 

228 butler.registry.insertDimensionData( 

229 "visit", 

230 { 

231 "instrument": "DummyCamComp", 

232 "id": 423, 

233 "name": "fourtwentythree", 

234 "physical_filter": "d-r", 

235 "visit_system": 1, 

236 "datetime_begin": visit_start, 

237 "datetime_end": visit_end, 

238 }, 

239 ) 

240 

241 # Add more visits for some later tests 

242 for visit_id in (424, 425): 

243 butler.registry.insertDimensionData( 

244 "visit", 

245 { 

246 "instrument": "DummyCamComp", 

247 "id": visit_id, 

248 "name": f"fourtwentyfour_{visit_id}", 

249 "physical_filter": "d-r", 

250 "visit_system": 1, 

251 }, 

252 ) 

253 return butler, datasetType 
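# Note the insertion order used above: dimension records must satisfy
# their dependencies, so "instrument" is inserted before
# "physical_filter" and "visit_system", and all of those before the
# "visit" records that reference them. Inserting a visit first would
# be expected to fail on the foreign-key constraint.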

254 

255 def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Butler: 

256 # New datasets will be added to run and tag, but we will only look in 

257 # tag when looking up datasets. 

258 run = self.default_run 

259 butler, datasetType = self.create_butler(run, storageClass, datasetTypeName) 

260 assert butler.run is not None 

261 

262 # Create and store a dataset 

263 metric = makeExampleMetrics() 

264 dataId = butler.registry.expandDataId({"instrument": "DummyCamComp", "visit": 423}) 

265 

266 # Put and remove the dataset once as a DatasetRef, once as a dataId, 

267 # and once with a DatasetType 

268 

269 # Keep track of any collections we add and do not clean up 

270 expected_collections = {run} 

271 

272 counter = 0 

273 ref = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run="put_run_1") 

274 args: tuple[DatasetRef] | tuple[str | DatasetType, DataCoordinate]

275 for args in ((ref,), (datasetTypeName, dataId), (datasetType, dataId)): 

276 # Since we are using subTest we can get cascading failures 

277 # here with the first attempt failing and the others failing 

278 # immediately because the dataset already exists. Work around 

279 # this by using a distinct run collection each time 

280 counter += 1 

281 this_run = f"put_run_{counter}" 

282 butler.registry.registerCollection(this_run, type=CollectionType.RUN) 

283 expected_collections.update({this_run}) 

284 

285 with self.subTest(args=args): 

286 kwargs: dict[str, Any] = {} 

287 if not isinstance(args[0], DatasetRef): # type: ignore 

288 kwargs["run"] = this_run 

289 ref = butler.put(metric, *args, **kwargs) 

290 self.assertIsInstance(ref, DatasetRef) 

291 

292 # Test get with a resolved DatasetRef (the former getDirect).

293 metricOut = butler.get(ref) 

294 self.assertEqual(metric, metricOut) 

295 # Test get 

296 metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run) 

297 self.assertEqual(metric, metricOut) 

298 # Test get with a datasetRef 

299 metricOut = butler.get(ref) 

300 self.assertEqual(metric, metricOut) 

301 # Test getDeferred with dataId 

302 metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get() 

303 self.assertEqual(metric, metricOut) 

304 # Test getDeferred with a ref 

305 metricOut = butler.getDeferred(ref).get() 

306 self.assertEqual(metric, metricOut) 

307 

308 # Check we can get components 

309 if storageClass.isComposite(): 

310 self.assertGetComponents( 

311 butler, ref, ("summary", "data", "output"), metric, collections=this_run 

312 ) 

313 

314 # Can the artifacts themselves be retrieved? 

315 if not butler._datastore.isEphemeral: 

316 root_uri = ResourcePath(self.root) 

317 

318 for preserve_path in (True, False): 

319 destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/") 

320 # Use copy so that we can test that overwrite 

321 # protection works (using "auto" for File URIs would 

322 # use hard links and subsequent transfer would work 

323 # because it knows they are the same file). 

324 transferred = butler.retrieveArtifacts( 

325 [ref], destination, preserve_path=preserve_path, transfer="copy" 

326 ) 

327 self.assertGreater(len(transferred), 0) 

328 artifacts = list(ResourcePath.findFileResources([destination])) 

329 self.assertEqual(set(transferred), set(artifacts)) 

330 

331 for artifact in transferred: 

332 path_in_destination = artifact.relative_to(destination) 

333 self.assertIsNotNone(path_in_destination) 

334 assert path_in_destination is not None 

335 

336 # When the path is not preserved there should not be

337 # any path separators.

338 num_seps = path_in_destination.count("/") 

339 if preserve_path: 

340 self.assertGreater(num_seps, 0) 

341 else: 

342 self.assertEqual(num_seps, 0) 

343 

344 primary_uri, secondary_uris = butler.getURIs(ref) 

345 n_uris = len(secondary_uris) 

346 if primary_uri: 

347 n_uris += 1 

348 self.assertEqual( 

349 len(artifacts), 

350 n_uris, 

351 "Comparing expected artifacts vs actual:" 

352 f" {artifacts} vs {primary_uri} and {secondary_uris}", 

353 ) 

354 

355 if preserve_path: 

356 # No need to run these twice 

357 with self.assertRaises(ValueError): 

358 butler.retrieveArtifacts([ref], destination, transfer="move") 

359 

360 with self.assertRaises(FileExistsError): 

361 butler.retrieveArtifacts([ref], destination) 

362 

363 transferred_again = butler.retrieveArtifacts( 

364 [ref], destination, preserve_path=preserve_path, overwrite=True 

365 ) 

366 self.assertEqual(set(transferred_again), set(transferred)) 

367 

368 # Now remove the dataset completely. 

369 butler.pruneDatasets([ref], purge=True, unstore=True) 

370 # Lookup with original args should still fail. 

371 kwargs = {"collections": this_run} 

372 if isinstance(args[0], DatasetRef): 

373 kwargs = {} # Prevent warning from being issued. 

374 self.assertFalse(butler.exists(*args, **kwargs)) 

375 # get() should still fail. 

376 with self.assertRaises(FileNotFoundError): 

377 butler.get(ref) 

378 # Registry shouldn't be able to find it by dataset_id anymore. 

379 self.assertIsNone(butler.registry.getDataset(ref.id)) 

380 

381 # Do explicit registry removal since we know they are 

382 # empty 

383 butler.registry.removeCollection(this_run) 

384 expected_collections.remove(this_run) 

385 

386 # Create DatasetRef for put using default run. 

387 refIn = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run=butler.run) 

388 

389 # Put the dataset again, since the last thing we did was remove it 

390 # and we want to use the default collection. 

391 ref = butler.put(metric, refIn) 

392 

393 # Get with parameters 

394 stop = 4 

395 sliced = butler.get(ref, parameters={"slice": slice(stop)}) 

396 self.assertNotEqual(metric, sliced) 

397 self.assertEqual(metric.summary, sliced.summary) 

398 self.assertEqual(metric.output, sliced.output) 

399 assert metric.data is not None # for mypy 

400 self.assertEqual(metric.data[:stop], sliced.data) 

401 # getDeferred with parameters 

402 sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get() 

403 self.assertNotEqual(metric, sliced) 

404 self.assertEqual(metric.summary, sliced.summary) 

405 self.assertEqual(metric.output, sliced.output) 

406 self.assertEqual(metric.data[:stop], sliced.data) 

407 # getDeferred with deferred parameters 

408 sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)}) 

409 self.assertNotEqual(metric, sliced) 

410 self.assertEqual(metric.summary, sliced.summary) 

411 self.assertEqual(metric.output, sliced.output) 

412 self.assertEqual(metric.data[:stop], sliced.data) 

413 

414 if storageClass.isComposite(): 

415 # Check that components can be retrieved 

416 metricOut = butler.get(ref.datasetType.name, dataId) 

417 compNameS = ref.datasetType.componentTypeName("summary") 

418 compNameD = ref.datasetType.componentTypeName("data") 

419 summary = butler.get(compNameS, dataId) 

420 self.assertEqual(summary, metric.summary) 

421 data = butler.get(compNameD, dataId) 

422 self.assertEqual(data, metric.data) 

423 

424 if "counter" in storageClass.derivedComponents: 

425 count = butler.get(ref.datasetType.componentTypeName("counter"), dataId) 

426 self.assertEqual(count, len(data)) 

427 

428 count = butler.get( 

429 ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)} 

430 ) 

431 self.assertEqual(count, stop) 

432 

433 compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections) 

434 assert compRef is not None 

435 summary = butler.get(compRef) 

436 self.assertEqual(summary, metric.summary) 

437 

438 # Create a Dataset type that has the same name but is inconsistent. 

439 inconsistentDatasetType = DatasetType( 

440 datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config") 

441 ) 

442 

443 # Getting with a dataset type that does not match registry fails 

444 with self.assertRaisesRegex(ValueError, "Supplied dataset type .* inconsistent with registry"): 

445 butler.get(inconsistentDatasetType, dataId) 

446 

447 # Combining a DatasetRef with a dataId should fail 

448 with self.assertRaisesRegex(ValueError, "DatasetRef given, cannot use dataId as well"): 

449 butler.get(ref, dataId) 

450 # Getting with an explicit ref should fail if the id doesn't match. 

451 with self.assertRaises(FileNotFoundError): 

452 butler.get(DatasetRef(ref.datasetType, ref.dataId, id=uuid.UUID(int=101), run=butler.run)) 

453 

454 # Getting a dataset with unknown parameters should fail 

455 with self.assertRaisesRegex(KeyError, "Parameter 'unsupported' not understood"): 

456 butler.get(ref, parameters={"unsupported": True}) 

457 

458 # Check we have a collection 

459 collections = set(butler.registry.queryCollections()) 

460 self.assertEqual(collections, expected_collections) 

461 

462 # Clean up to check that we can remove something that may have 

463 # already had a component removed 

464 butler.pruneDatasets([ref], unstore=True, purge=True) 

465 

466 # Add the same ref again, so we can check that duplicate put fails. 

467 ref = butler.put(metric, datasetType, dataId) 

468 

469 # Repeat put will fail. 

470 with self.assertRaisesRegex( 

471 ConflictingDefinitionError, "A database constraint failure was triggered" 

472 ): 

473 butler.put(metric, datasetType, dataId) 

474 

475 # Remove the datastore entry. 

476 butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False) 

477 

478 # Put will still fail 

479 with self.assertRaisesRegex( 

480 ConflictingDefinitionError, "A database constraint failure was triggered" 

481 ): 

482 butler.put(metric, datasetType, dataId) 

483 

484 # Repeat the same sequence with resolved ref. 

485 butler.pruneDatasets([ref], unstore=True, purge=True) 

486 ref = butler.put(metric, refIn) 

487 

488 # Repeat put will fail. 

489 with self.assertRaisesRegex(ConflictingDefinitionError, "Datastore already contains dataset"): 

490 butler.put(metric, refIn) 

491 

492 # Remove the datastore entry. 

493 butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False) 

494 

495 # In case of resolved ref this write will succeed. 

496 ref = butler.put(metric, refIn) 

497 

498 # Leave the dataset in place since some downstream tests require 

499 # something to be present 

500 

501 return butler 

502 

503 def testDeferredCollectionPassing(self) -> None: 

504 # Construct a butler with no run or collection, but make it writeable. 

505 butler = Butler(self.tmpConfigFile, writeable=True) 

506 # Create and register a DatasetType 

507 dimensions = butler.dimensions.extract(["instrument", "visit"]) 

508 datasetType = self.addDatasetType( 

509 "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry 

510 ) 

511 # Add needed Dimensions 

512 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

513 butler.registry.insertDimensionData( 

514 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

515 ) 

516 butler.registry.insertDimensionData( 

517 "visit", 

518 {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"}, 

519 ) 

520 dataId = {"instrument": "DummyCamComp", "visit": 423} 

521 # Create dataset. 

522 metric = makeExampleMetrics() 

523 # Register a new run and put dataset. 

524 run = "deferred" 

525 self.assertTrue(butler.registry.registerRun(run)) 

526 # Second time it will be allowed but indicate no-op 

527 self.assertFalse(butler.registry.registerRun(run)) 

528 ref = butler.put(metric, datasetType, dataId, run=run) 

529 # Putting with no run should fail with CollectionError.

530 with self.assertRaises(CollectionError): 

531 butler.put(metric, datasetType, dataId) 

532 # Dataset should exist. 

533 self.assertTrue(butler.exists(datasetType, dataId, collections=[run])) 

534 # We should be able to get the dataset back, but with and without 

535 # a deferred dataset handle. 

536 self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run])) 

537 self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get()) 

538 # Trying to find the dataset without any collection should fail with CollectionError.

539 self.assertFalse(butler.exists(datasetType, dataId)) 

540 with self.assertRaises(CollectionError): 

541 butler.get(datasetType, dataId) 

542 # Associate the dataset with a different collection. 

543 butler.registry.registerCollection("tagged") 

544 butler.registry.associate("tagged", [ref]) 

545 # Removing the dataset from the new tagged collection should still

546 # leave it findable in the original run collection.

547 butler.pruneDatasets([ref], tags=["tagged"]) 

548 self.assertTrue(butler.exists(datasetType, dataId, collections=[run])) 

549 
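# The deferred-handle contract relied on above, in miniature: the
# lookup is resolved when getDeferred() is called, but no dataset I/O
# happens until .get() on the returned handle (a sketch):
#
#     handle = butler.getDeferred(datasetType, dataId, collections=[run])
#     metric = handle.get()   # the read happens here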

550 

551class ButlerTests(ButlerPutGetTests): 

552 """Tests for Butler.""" 

553 

554 useTempRoot = True 

555 validationCanFail: bool 

556 fullConfigKey: str | None 

557 registryStr: str | None 

558 datastoreName: list[str] | None 

559 datastoreStr: list[str] 

560 

561 def setUp(self) -> None: 

562 """Create a new butler root for each test.""" 

563 self.root = makeTestTempDir(TESTDIR) 

564 Butler.makeRepo(self.root, config=Config(self.configFile)) 

565 self.tmpConfigFile = os.path.join(self.root, "butler.yaml") 

566 

567 def testConstructor(self) -> None: 

568 """Independent test of constructor.""" 

569 butler = Butler(self.tmpConfigFile, run=self.default_run) 

570 self.assertIsInstance(butler, Butler) 

571 

572 # Check that butler.yaml is added automatically. 

573 if self.tmpConfigFile.endswith(end := "/butler.yaml"): 

574 config_dir = self.tmpConfigFile[: -len(end)] 

575 butler = Butler(config_dir, run=self.default_run) 

576 self.assertIsInstance(butler, Butler) 

577 

578 # Even with a ResourcePath. 

579 butler = Butler(ResourcePath(config_dir, forceDirectory=True), run=self.default_run) 

580 self.assertIsInstance(butler, Butler) 

581 

582 collections = set(butler.registry.queryCollections()) 

583 self.assertEqual(collections, {self.default_run}) 

584 

585 # Check that some special characters can be included in run name. 

586 special_run = "u@b.c-A" 

587 butler_special = Butler(butler=butler, run=special_run) 

588 collections = set(butler_special.registry.queryCollections("*@*")) 

589 self.assertEqual(collections, {special_run}) 

590 

591 butler2 = Butler(butler=butler, collections=["other"]) 

592 self.assertEqual(butler2.collections, ("other",)) 

593 self.assertIsNone(butler2.run) 

594 self.assertIs(butler._datastore, butler2._datastore) 

595 

596 # Test that we can use an environment variable to find this 

597 # repository. 

598 butler_index = Config() 

599 butler_index["label"] = self.tmpConfigFile 

600 for suffix in (".yaml", ".json"): 

601 # Ensure that the content differs so that we know that 

602 # we aren't reusing the cache. 

603 bad_label = f"file://bucket/not_real{suffix}" 

604 butler_index["bad_label"] = bad_label 

605 with ResourcePath.temporary_uri(suffix=suffix) as temp_file: 

606 butler_index.dumpToUri(temp_file) 

607 with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}): 

608 self.assertEqual(Butler.get_known_repos(), {"label", "bad_label"}) 

609 uri = Butler.get_repo_uri("bad_label") 

610 self.assertEqual(uri, ResourcePath(bad_label)) 

611 uri = Butler.get_repo_uri("label") 

612 butler = Butler(uri, writeable=False) 

613 self.assertIsInstance(butler, Butler) 

614 butler = Butler("label", writeable=False) 

615 self.assertIsInstance(butler, Butler) 

616 with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"): 

617 Butler("not_there", writeable=False) 

618 with self.assertRaisesRegex(FileNotFoundError, "resolved from alias 'bad_label'"): 

619 Butler("bad_label") 

620 with self.assertRaises(FileNotFoundError): 

621 # Should ignore aliases. 

622 Butler(ResourcePath("label", forceAbsolute=False)) 

623 with self.assertRaises(KeyError) as cm: 

624 Butler.get_repo_uri("missing") 

625 self.assertEqual( 

626 Butler.get_repo_uri("missing", True), ResourcePath("missing", forceAbsolute=False) 

627 ) 

628 self.assertIn("not known to", str(cm.exception)) 

629 # Should report no failure. 

630 self.assertEqual(ButlerRepoIndex.get_failure_reason(), "") 

631 with ResourcePath.temporary_uri(suffix=suffix) as temp_file: 

632 # Now with empty configuration. 

633 butler_index = Config() 

634 butler_index.dumpToUri(temp_file) 

635 with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}): 

636 with self.assertRaisesRegex(FileNotFoundError, "(no known aliases)"): 

637 Butler("label") 

638 with ResourcePath.temporary_uri(suffix=suffix) as temp_file: 

639 # Now with bad contents. 

640 with open(temp_file.ospath, "w") as fh: 

641 print("'", file=fh) 

642 with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}): 

643 with self.assertRaisesRegex(FileNotFoundError, "(no known aliases:.*could not be read)"): 

644 Butler("label") 

645 with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}): 

646 with self.assertRaises(FileNotFoundError): 

647 Butler.get_repo_uri("label") 

648 self.assertEqual(Butler.get_known_repos(), set()) 

649 

650 with self.assertRaisesRegex(FileNotFoundError, "index file not found"): 

651 Butler("label") 

652 

653 # Check that we can create Butler when the alias file is not found. 

654 butler = Butler(self.tmpConfigFile, writeable=False) 

655 self.assertIsInstance(butler, Butler) 

656 with self.assertRaises(KeyError) as cm: 

657 # No environment variable set. 

658 Butler.get_repo_uri("label") 

659 self.assertEqual(Butler.get_repo_uri("label", True), ResourcePath("label", forceAbsolute=False)) 

660 self.assertIn("No repository index defined", str(cm.exception)) 

661 with self.assertRaisesRegex(FileNotFoundError, "no known aliases.*No repository index"): 

662 # No aliases registered. 

663 Butler("not_there") 

664 self.assertEqual(Butler.get_known_repos(), set()) 

665 

666 def testBasicPutGet(self) -> None: 

667 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

668 self.runPutGetTest(storageClass, "test_metric") 

669 

670 def testCompositePutGetConcrete(self) -> None: 

671 storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly") 

672 butler = self.runPutGetTest(storageClass, "test_metric") 

673 

674 # Should *not* be disassembled 

675 datasets = list(butler.registry.queryDatasets(..., collections=self.default_run)) 

676 self.assertEqual(len(datasets), 1) 

677 uri, components = butler.getURIs(datasets[0]) 

678 self.assertIsInstance(uri, ResourcePath) 

679 self.assertFalse(components) 

680 self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}") 

681 self.assertIn("423", str(uri), f"Checking visit is in URI {uri}") 

682 

683 # Predicted dataset 

684 dataId = {"instrument": "DummyCamComp", "visit": 424} 

685 uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True) 

686 self.assertFalse(components) 

687 self.assertIsInstance(uri, ResourcePath) 

688 self.assertIn("424", str(uri), f"Checking visit is in URI {uri}") 

689 self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}") 

690 

691 def testCompositePutGetVirtual(self) -> None: 

692 storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp") 

693 butler = self.runPutGetTest(storageClass, "test_metric_comp") 

694 

695 # Should be disassembled 

696 datasets = list(butler.registry.queryDatasets(..., collections=self.default_run)) 

697 self.assertEqual(len(datasets), 1) 

698 uri, components = butler.getURIs(datasets[0]) 

699 

700 if butler._datastore.isEphemeral: 

701 # Never disassemble in-memory datastore 

702 self.assertIsInstance(uri, ResourcePath) 

703 self.assertFalse(components) 

704 self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}") 

705 self.assertIn("423", str(uri), f"Checking visit is in URI {uri}") 

706 else: 

707 self.assertIsNone(uri) 

708 self.assertEqual(set(components), set(storageClass.components)) 

709 for compuri in components.values(): 

710 self.assertIsInstance(compuri, ResourcePath) 

711 self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}") 

712 self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}") 

713 

714 # Predicted dataset 

715 dataId = {"instrument": "DummyCamComp", "visit": 424} 

716 uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True) 

717 

718 if butler._datastore.isEphemeral: 

719 # Never disassembled 

720 self.assertIsInstance(uri, ResourcePath) 

721 self.assertFalse(components) 

722 self.assertIn("424", str(uri), f"Checking visit is in URI {uri}") 

723 self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}") 

724 else: 

725 self.assertIsNone(uri) 

726 self.assertEqual(set(components), set(storageClass.components)) 

727 for compuri in components.values(): 

728 self.assertIsInstance(compuri, ResourcePath) 

729 self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}") 

730 self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}") 

731 

732 def testStorageClassOverrideGet(self) -> None: 

733 """Test storage class conversion on get with override.""" 

734 storageClass = self.storageClassFactory.getStorageClass("StructuredData") 

735 datasetTypeName = "anything" 

736 run = self.default_run 

737 

738 butler, datasetType = self.create_butler(run, storageClass, datasetTypeName) 

739 

740 # Create and store a dataset. 

741 metric = makeExampleMetrics() 

742 dataId = {"instrument": "DummyCamComp", "visit": 423} 

743 

744 ref = butler.put(metric, datasetType, dataId) 

745 

746 # Return native type. 

747 retrieved = butler.get(ref) 

748 self.assertEqual(retrieved, metric) 

749 

750 # Specify an override. 

751 new_sc = self.storageClassFactory.getStorageClass("MetricsConversion") 

752 model = butler.get(ref, storageClass=new_sc) 

753 self.assertNotEqual(type(model), type(retrieved)) 

754 self.assertIs(type(model), new_sc.pytype) 

755 self.assertEqual(retrieved, model) 

756 

757 # Defer but override later. 

758 deferred = butler.getDeferred(ref) 

759 model = deferred.get(storageClass=new_sc) 

760 self.assertIs(type(model), new_sc.pytype) 

761 self.assertEqual(retrieved, model) 

762 

763 # Defer but override up front. 

764 deferred = butler.getDeferred(ref, storageClass=new_sc) 

765 model = deferred.get() 

766 self.assertIs(type(model), new_sc.pytype) 

767 self.assertEqual(retrieved, model) 

768 

769 # Retrieve a component. Should be a tuple. 

770 data = butler.get("anything.data", dataId, storageClass="StructuredDataDataTestTuple") 

771 self.assertIs(type(data), tuple) 

772 self.assertEqual(data, tuple(retrieved.data)) 

773 

774 # Parameter on the write storage class should work regardless 

775 # of read storage class. 

776 data = butler.get( 

777 "anything.data", 

778 dataId, 

779 storageClass="StructuredDataDataTestTuple", 

780 parameters={"slice": slice(2, 4)}, 

781 ) 

782 self.assertEqual(len(data), 2) 

783 

784 # Try a parameter that is known to the read storage class but not 

785 # the write storage class. 

786 with self.assertRaises(KeyError): 

787 butler.get( 

788 "anything.data", 

789 dataId, 

790 storageClass="StructuredDataDataTestTuple", 

791 parameters={"xslice": slice(2, 4)}, 

792 ) 

793 

794 def testPytypePutCoercion(self) -> None: 

795 """Test python type coercion on Butler.get and put.""" 

796 # Store some data with the normal example storage class. 

797 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

798 datasetTypeName = "test_metric" 

799 butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName) 

800 

801 dataId = {"instrument": "DummyCamComp", "visit": 423} 

802 

803 # Put a dict and this should coerce to a MetricsExample 

804 test_dict = {"summary": {"a": 1}, "output": {"b": 2}} 

805 metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424) 

806 test_metric = butler.get(metric_ref) 

807 self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample") 

808 self.assertEqual(test_metric.summary, test_dict["summary"]) 

809 self.assertEqual(test_metric.output, test_dict["output"]) 

810 

811 # Check that the put still works if a DatasetType is given with 

812 # a definition matching this python type. 

813 registry_type = butler.registry.getDatasetType(datasetTypeName) 

814 this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson") 

815 metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425) 

816 self.assertEqual(metric2_ref.datasetType, registry_type) 

817 

818 # The get will return the type expected by registry. 

819 test_metric2 = butler.get(metric2_ref) 

820 self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample") 

821 

822 # Make a new DatasetRef with the compatible but different DatasetType. 

823 # This should now return a dict. 

824 new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run) 

825 test_dict2 = butler.get(new_ref) 

826 self.assertEqual(get_full_type_name(test_dict2), "dict") 

827 

828 # Get it again with the compatible but different dataset type,

829 # this time looked up by DatasetType and data ID rather than by ref.

830 # The result should be consistent and use the python type of the given DatasetType (a dict).

831 test_dict3 = butler.get(this_type, dataId=dataId, visit=425) 

832 self.assertEqual(get_full_type_name(test_dict3), "dict") 

833 

834 def testIngest(self) -> None: 

835 butler = Butler(self.tmpConfigFile, run=self.default_run) 

836 

837 # Create and register a DatasetType 

838 dimensions = butler.dimensions.extract(["instrument", "visit", "detector"]) 

839 

840 storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml") 

841 datasetTypeName = "metric" 

842 

843 datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry) 

844 

845 # Add needed Dimensions 

846 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

847 butler.registry.insertDimensionData( 

848 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

849 ) 

850 for detector in (1, 2): 

851 butler.registry.insertDimensionData( 

852 "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"} 

853 ) 

854 

855 butler.registry.insertDimensionData( 

856 "visit", 

857 {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"}, 

858 {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"}, 

859 ) 

860 

861 formatter = doImportType("lsst.daf.butler.formatters.yaml.YamlFormatter") 

862 dataRoot = os.path.join(TESTDIR, "data", "basic") 

863 datasets = [] 

864 for detector in (1, 2): 

865 detector_name = f"detector_{detector}" 

866 metricFile = os.path.join(dataRoot, f"{detector_name}.yaml") 

867 dataId = butler.registry.expandDataId( 

868 {"instrument": "DummyCamComp", "visit": 423, "detector": detector} 

869 ) 

870 # Create a DatasetRef for ingest 

871 refIn = DatasetRef(datasetType, dataId, run=self.default_run) 

872 

873 datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter)) 

874 

875 butler.ingest(*datasets, transfer="copy") 

876 

877 dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423} 

878 dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423} 

879 

880 metrics1 = butler.get(datasetTypeName, dataId1) 

881 metrics2 = butler.get(datasetTypeName, dataId2) 

882 self.assertNotEqual(metrics1, metrics2) 

883 

884 # Compare URIs 

885 uri1 = butler.getURI(datasetTypeName, dataId1) 

886 uri2 = butler.getURI(datasetTypeName, dataId2) 

887 self.assertNotEqual(uri1, uri2) 

888 

889 # Now do a multi-dataset but single file ingest 

890 metricFile = os.path.join(dataRoot, "detectors.yaml") 

891 refs = [] 

892 for detector in (1, 2): 

893 detector_name = f"detector_{detector}" 

894 dataId = butler.registry.expandDataId( 

895 {"instrument": "DummyCamComp", "visit": 424, "detector": detector} 

896 ) 

897 # Create a DatasetRef for ingest 

898 refs.append(DatasetRef(datasetType, dataId, run=self.default_run)) 

899 

900 # Test "move" transfer to ensure that the files themselves 

901 # have disappeared following ingest. 

902 with ResourcePath.temporary_uri(suffix=".yaml") as tempFile: 

903 tempFile.transfer_from(ResourcePath(metricFile), transfer="copy") 

904 

905 datasets = [] 

906 datasets.append(FileDataset(path=tempFile, refs=refs, formatter=MultiDetectorFormatter)) 

907 

908 # For first ingest use copy. 

909 butler.ingest(*datasets, transfer="copy", record_validation_info=False) 

910 

911 # Now try to ingest again in "execution butler" mode where 

912 # the registry entries exist but the datastore does not have 

913 # the files. We also need to strip the dimension records to ensure 

914 # that they will be re-added by the ingest. 

915 ref = datasets[0].refs[0] 

916 datasets[0].refs = [ 

917 cast( 

918 DatasetRef, 

919 butler.registry.findDataset(ref.datasetType, dataId=ref.dataId, collections=ref.run), 

920 ) 

921 for ref in datasets[0].refs 

922 ] 

923 all_refs = [] 

924 for dataset in datasets: 

925 refs = [] 

926 for ref in dataset.refs: 

927 # Create a dict from the dataId to drop the records. 

928 new_data_id = {str(k): v for k, v in ref.dataId.items()} 

929 new_ref = butler.registry.findDataset(ref.datasetType, new_data_id, collections=ref.run) 

930 assert new_ref is not None 

931 self.assertFalse(new_ref.dataId.hasRecords()) 

932 refs.append(new_ref) 

933 dataset.refs = refs 

934 all_refs.extend(dataset.refs) 

935 butler.pruneDatasets(all_refs, disassociate=False, unstore=True, purge=False) 

936 

937 # Use move mode to test that the file is deleted. Also 

938 # disable recording of file size. 

939 butler.ingest(*datasets, transfer="move", record_validation_info=False) 

940 

941 # Check that every ref now has records. 

942 for dataset in datasets: 

943 for ref in dataset.refs: 

944 self.assertTrue(ref.dataId.hasRecords()) 

945 

946 # Ensure that the file has disappeared. 

947 self.assertFalse(tempFile.exists()) 

948 

949 # Check that the datastore recorded no file size. 

950 # Not all datastores can support this. 

951 try: 

952 infos = butler._datastore.getStoredItemsInfo(datasets[0].refs[0]) # type: ignore[attr-defined] 

953 self.assertEqual(infos[0].file_size, -1) 

954 except AttributeError: 

955 pass 

956 

957 dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424} 

958 dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424} 

959 

960 multi1 = butler.get(datasetTypeName, dataId1) 

961 multi2 = butler.get(datasetTypeName, dataId2) 

962 

963 self.assertEqual(multi1, metrics1) 

964 self.assertEqual(multi2, metrics2) 

965 

966 # Compare URIs 

967 uri1 = butler.getURI(datasetTypeName, dataId1) 

968 uri2 = butler.getURI(datasetTypeName, dataId2) 

969 self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}") 

970 

971 # Test that removing one does not break the second 

972 # This line will issue a warning log message for a ChainedDatastore 

973 # that uses an InMemoryDatastore, since an in-memory datastore cannot

974 # ingest files.

975 butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False) 

976 self.assertFalse(butler.exists(datasetTypeName, dataId1)) 

977 self.assertTrue(butler.exists(datasetTypeName, dataId2)) 

978 multi2b = butler.get(datasetTypeName, dataId2) 

979 self.assertEqual(multi2, multi2b) 

980 

981 # Ensure we can ingest 0 datasets 

982 datasets = [] 

983 butler.ingest(*datasets) 
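# The ingest pattern this test exercises, reduced to its core (a
# sketch; path, refs and formatter stand in for the values built
# above):
#
#     dataset = FileDataset(path=metricFile, refs=[refIn], formatter=formatter)
#     butler.ingest(dataset, transfer="copy")  # transfer="move" deletes the source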

984 

985 def testPickle(self) -> None: 

986 """Test pickle support.""" 

987 butler = Butler(self.tmpConfigFile, run=self.default_run) 

988 butlerOut = pickle.loads(pickle.dumps(butler)) 

989 self.assertIsInstance(butlerOut, Butler) 

990 self.assertEqual(butlerOut._config, butler._config) 

991 self.assertEqual(butlerOut.collections, butler.collections) 

992 self.assertEqual(butlerOut.run, butler.run) 

993 

994 def testGetDatasetTypes(self) -> None: 

995 butler = Butler(self.tmpConfigFile, run=self.default_run) 

996 dimensions = butler.dimensions.extract(["instrument", "visit", "physical_filter"]) 

997 dimensionEntries: list[tuple[str, list[Mapping[str, Any]]]] = [ 

998 ( 

999 "instrument", 

1000 [ 

1001 {"instrument": "DummyCam"}, 

1002 {"instrument": "DummyHSC"}, 

1003 {"instrument": "DummyCamComp"}, 

1004 ], 

1005 ), 

1006 ("physical_filter", [{"instrument": "DummyCam", "name": "d-r", "band": "R"}]), 

1007 ("visit", [{"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}]), 

1008 ] 

1009 storageClass = self.storageClassFactory.getStorageClass("StructuredData") 

1010 # Add needed Dimensions 

1011 for element, data in dimensionEntries: 

1012 butler.registry.insertDimensionData(element, *data) 

1013 

1014 # When a DatasetType is added to the registry entries are not created 

1015 # for components but querying them can return the components. 

1016 datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"} 

1017 components = set() 

1018 for datasetTypeName in datasetTypeNames: 

1019 # Create and register a DatasetType 

1020 self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry) 

1021 

1022 for componentName in storageClass.components: 

1023 components.add(DatasetType.nameWithComponent(datasetTypeName, componentName)) 

1024 

1025 fromRegistry: set[DatasetType] = set() 

1026 for parent_dataset_type in butler.registry.queryDatasetTypes(): 

1027 fromRegistry.add(parent_dataset_type) 

1028 fromRegistry.update(parent_dataset_type.makeAllComponentDatasetTypes()) 

1029 self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components) 

1030 

1031 # Now that we have some dataset types registered, validate them 

1032 butler.validateConfiguration( 

1033 ignore=[ 

1034 "test_metric_comp", 

1035 "metric3", 

1036 "metric5", 

1037 "calexp", 

1038 "DummySC", 

1039 "datasetType.component", 

1040 "random_data", 

1041 "random_data_2", 

1042 ] 

1043 ) 

1044 

1045 # Add a new datasetType that will fail template validation 

1046 self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry) 

1047 if self.validationCanFail: 

1048 with self.assertRaises(ValidationError): 

1049 butler.validateConfiguration() 

1050 

1051 # Rerun validation but with a subset of dataset type names 

1052 butler.validateConfiguration(datasetTypeNames=["metric4"]) 

1053 

1054 # Rerun validation but ignore the bad datasetType 

1055 butler.validateConfiguration( 

1056 ignore=[ 

1057 "test_metric_comp", 

1058 "metric3", 

1059 "metric5", 

1060 "calexp", 

1061 "DummySC", 

1062 "datasetType.component", 

1063 "random_data", 

1064 "random_data_2", 

1065 ] 

1066 ) 

1067 

1068 def testTransaction(self) -> None: 

1069 butler = Butler(self.tmpConfigFile, run=self.default_run) 

1070 datasetTypeName = "test_metric" 

1071 dimensions = butler.dimensions.extract(["instrument", "visit"]) 

1072 dimensionEntries: tuple[tuple[str, Mapping[str, Any]], ...] = ( 

1073 ("instrument", {"instrument": "DummyCam"}), 

1074 ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}), 

1075 ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}), 

1076 ) 

1077 storageClass = self.storageClassFactory.getStorageClass("StructuredData") 

1078 metric = makeExampleMetrics() 

1079 dataId = {"instrument": "DummyCam", "visit": 42} 

1080 # Create and register a DatasetType 

1081 datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry) 

1082 with self.assertRaises(TransactionTestError): 

1083 with butler.transaction(): 

1084 # Add needed Dimensions 

1085 for args in dimensionEntries: 

1086 butler.registry.insertDimensionData(*args) 

1087 # Store a dataset 

1088 ref = butler.put(metric, datasetTypeName, dataId) 

1089 self.assertIsInstance(ref, DatasetRef) 

1090 # Test get with a resolved DatasetRef (the former getDirect).

1091 metricOut = butler.get(ref) 

1092 self.assertEqual(metric, metricOut) 

1093 # Test get 

1094 metricOut = butler.get(datasetTypeName, dataId) 

1095 self.assertEqual(metric, metricOut) 

1096 # Check we can get components 

1097 self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric) 

1098 raise TransactionTestError("This should roll back the entire transaction") 

1099 with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"): 

1100 butler.registry.expandDataId(dataId) 

1101 # Should raise LookupError for missing data ID value 

1102 with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"): 

1103 butler.get(datasetTypeName, dataId) 

1104 # Also check explicitly if Dataset entry is missing 

1105 self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections)) 

1106 # Direct retrieval should not find the file in the Datastore 

1107 with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"): 

1108 butler.get(ref) 

1109 

1110 def testMakeRepo(self) -> None: 

1111 """Test that we can write butler configuration to a new repository via 

1112 the Butler.makeRepo interface and then instantiate a butler from the 

1113 repo root. 

1114 """ 

1115 # Do not run the test if we know this datastore configuration does 

1116 # not support a file system root 

1117 if self.fullConfigKey is None: 

1118 return 

1119 

1120 # create two separate directories 

1121 root1 = tempfile.mkdtemp(dir=self.root) 

1122 root2 = tempfile.mkdtemp(dir=self.root) 

1123 

1124 butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile)) 

1125 limited = Config(self.configFile) 

1126 butler1 = Butler(butlerConfig) 

1127 butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile)) 

1128 full = Config(self.tmpConfigFile) 

1129 butler2 = Butler(butlerConfig) 

1130 # Butlers should have the same configuration regardless of whether 

1131 # defaults were expanded. 

1132 self.assertEqual(butler1._config, butler2._config) 

1133 # Config files loaded directly should not be the same. 

1134 self.assertNotEqual(limited, full) 

1135 # Make sure "limited" doesn't have a few keys we know it should be 

1136 # inheriting from defaults. 

1137 self.assertIn(self.fullConfigKey, full) 

1138 self.assertNotIn(self.fullConfigKey, limited) 

1139 

1140 # Collections don't appear until something is put in them 

1141 collections1 = set(butler1.registry.queryCollections()) 

1142 self.assertEqual(collections1, set()) 

1143 self.assertEqual(set(butler2.registry.queryCollections()), collections1) 

1144 

1145 # Check that a config with no associated file name will not 

1146 # work properly with a relocatable Butler repo.

1147 butlerConfig.configFile = None 

1148 with self.assertRaises(ValueError): 

1149 Butler(butlerConfig) 

1150 

1151 with self.assertRaises(FileExistsError): 

1152 Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False) 

1153 

1154 def testStringification(self) -> None: 

1155 butler = Butler(self.tmpConfigFile, run=self.default_run) 

1156 butlerStr = str(butler) 

1157 

1158 if self.datastoreStr is not None: 

1159 for testStr in self.datastoreStr: 

1160 self.assertIn(testStr, butlerStr) 

1161 if self.registryStr is not None: 

1162 self.assertIn(self.registryStr, butlerStr) 

1163 

1164 datastoreName = butler._datastore.name 

1165 if self.datastoreName is not None: 

1166 for testStr in self.datastoreName: 

1167 self.assertIn(testStr, datastoreName) 

1168 

1169 def testButlerRewriteDataId(self) -> None: 

1170 """Test that dataIds can be rewritten based on dimension records.""" 

1171 butler = Butler(self.tmpConfigFile, run=self.default_run) 

1172 

1173 storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict") 

1174 datasetTypeName = "random_data" 

1175 

1176 # Create dimension records. 

1177 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

1178 butler.registry.insertDimensionData( 

1179 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

1180 ) 

1181 butler.registry.insertDimensionData( 

1182 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"} 

1183 ) 

1184 

1185 dimensions = butler.dimensions.extract(["instrument", "exposure"]) 

1186 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

1187 butler.registry.registerDatasetType(datasetType) 

1188 

1189 n_exposures = 5 

1190 dayobs = 20210530 

1191 

1192 for i in range(n_exposures): 

1193 butler.registry.insertDimensionData( 

1194 "exposure", 

1195 { 

1196 "instrument": "DummyCamComp", 

1197 "id": i, 

1198 "obs_id": f"exp{i}", 

1199 "seq_num": i, 

1200 "day_obs": dayobs, 

1201 "physical_filter": "d-r", 

1202 }, 

1203 ) 

1204 

1205 # Write some data. 

1206 for i in range(n_exposures): 

1207 metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]} 

1208 

1209 # Use the seq_num for the put to test rewriting. 

1210 dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"} 

1211 ref = butler.put(metric, datasetTypeName, dataId=dataId) 

1212 

1213 # Check that the exposure is correct in the dataId 

1214 self.assertEqual(ref.dataId["exposure"], i) 

1215 

1216 # and check that we can get the dataset back with the same dataId 

1217 new_metric = butler.get(datasetTypeName, dataId=dataId) 

1218 self.assertEqual(new_metric, metric) 

1219 
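# The rewrite under test, in brief: a dataId phrased in alternate keys
# (seq_num + day_obs) is resolved against dimension records and
# rewritten to the canonical exposure key, so these two data IDs
# address the same dataset (values as used above):
#
#     {"seq_num": 3, "day_obs": 20210530, "instrument": "DummyCamComp", "physical_filter": "d-r"}
#     {"exposure": 3, "instrument": "DummyCamComp"}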

1220 

1221class FileDatastoreButlerTests(ButlerTests): 

1222 """Common tests and specialization of ButlerTests for butlers backed 

1223 by datastores that inherit from FileDatastore. 

1224 """ 

1225 

1226 def checkFileExists(self, root: str | ResourcePath, relpath: str | ResourcePath) -> bool: 

1227 """Check if file exists at a given path (relative to root). 

1228 

1229 The test testPutTemplates verifies the actual physical existence of

1230 the files at the requested location.

1231 """ 

1232 uri = ResourcePath(root, forceDirectory=True) 

1233 return uri.join(relpath).exists() 

1234 

1235 def testPutTemplates(self) -> None: 

1236 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1237 butler = Butler(self.tmpConfigFile, run=self.default_run) 

1238 

1239 # Add needed Dimensions 

1240 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

1241 butler.registry.insertDimensionData( 

1242 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

1243 ) 

1244 butler.registry.insertDimensionData( 

1245 "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"} 

1246 ) 

1247 butler.registry.insertDimensionData( 

1248 "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"} 

1249 ) 

1250 

1251 # Create and store a dataset 

1252 metric = makeExampleMetrics() 

1253 

1254 # Create two almost-identical DatasetTypes (both will use default 

1255 # template) 

1256 dimensions = butler.dimensions.extract(["instrument", "visit"]) 

1257 butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass)) 

1258 butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass)) 

1259 butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass)) 

1260 

1261 dataId1 = {"instrument": "DummyCamComp", "visit": 423} 

1262 dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"} 

1263 

1264 # Put with exactly the data ID keys needed 

1265 ref = butler.put(metric, "metric1", dataId1) 

1266 uri = butler.getURI(ref) 

1267 self.assertTrue(uri.exists()) 

1268 self.assertTrue( 

1269 uri.unquoted_path.endswith(f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle") 

1270 ) 

1271 

1272 # Check the template based on dimensions 

1273 if hasattr(butler._datastore, "templates"): 

1274 butler._datastore.templates.validateTemplates([ref]) 

1275 

1276 # Put with extra data ID keys (physical_filter is an optional 

1277 # dependency); should not change template (at least the way we're 

1278 # defining them to behave now; the important thing is that they 

1279 # must be consistent). 

1280 ref = butler.put(metric, "metric2", dataId2) 

1281 uri = butler.getURI(ref) 

1282 self.assertTrue(uri.exists()) 

1283 self.assertTrue( 

1284 uri.unquoted_path.endswith(f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle") 

1285 ) 

1286 

1287 # Check the template based on dimensions 

1288 if hasattr(butler._datastore, "templates"): 

1289 butler._datastore.templates.validateTemplates([ref]) 

1290 

1291 # Use a template that has a typo in dimension record metadata. 

1292 # Easier to test with a butler that has a ref with records attached. 

1293 template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits") 

1294 with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"): 

1295 path = template.format(ref) 

1296 self.assertEqual(path, f"a/v423/{ref.id}_fits") 

1297 

1298 template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits") 

1299 with self.assertRaises(KeyError): 

1300 with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"): 

1301 template.format(ref) 

1302 

1303 # Now use a file template that will not result in unique filenames 

1304 with self.assertRaises(FileTemplateValidationError): 

1305 butler.put(metric, "metric3", dataId1) 
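# FileTemplate fields in brief, as exercised above: plain names such as
# "{id}" come from the ref, dotted names such as "{visit.name}" reach
# into the attached dimension record, and a ":?" suffix marks a field
# optional, so an unknown record attribute is elided (with a logged
# message) instead of raising KeyError:
#
#     template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits")
#     path = template.format(ref)   # e.g. f"a/v423/{ref.id}_fits"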

1306 

1307 def testImportExport(self) -> None: 

1308 # Run put/get tests just to create and populate a repo. 

1309 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1310 self.runImportExportTest(storageClass) 

1311 

1312 @unittest.expectedFailure 

1313 def testImportExportVirtualComposite(self) -> None: 

1314 # Run put/get tests just to create and populate a repo. 

1315 storageClass = self.storageClassFactory.getStorageClass("StructuredComposite") 

1316 self.runImportExportTest(storageClass) 

1317 

1318 def runImportExportTest(self, storageClass: StorageClass) -> None: 

1319 """Test exporting and importing. 

1320 

1321 This test does an export to a temp directory and an import back 

1322 into a new temp directory repo. It does not assume a posix datastore. 

1323 """ 

1324 exportButler = self.runPutGetTest(storageClass, "test_metric") 

1325 

1326 # Test that we must have a file extension. 

1327 with self.assertRaises(ValueError): 

1328 with exportButler.export(filename="dump", directory=".") as export: 

1329 pass 

1330 

1331 # Test that unknown format is not allowed. 

1332 with self.assertRaises(ValueError): 

1333 with exportButler.export(filename="dump.fits", directory=".") as export: 

1334 pass 

1335 

1336 # Test that the repo actually has at least one dataset. 

1337 datasets = list(exportButler.registry.queryDatasets(..., collections=...)) 

1338 self.assertGreater(len(datasets), 0) 

1339 # Add a DimensionRecord that's unused by those datasets. 

1340 skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")} 

1341 exportButler.registry.insertDimensionData("skymap", skymapRecord) 

1342 # Export and then import datasets. 

1343 with safeTestTempDir(TESTDIR) as exportDir: 

1344 exportFile = os.path.join(exportDir, "exports.yaml") 

1345 with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export: 

1346 export.saveDatasets(datasets) 

1347 # Export the same datasets again. This should quietly do 

1348 # nothing because of internal deduplication, and it shouldn't 

1349 # complain about being asked to export the "htm7" elements even 

1350 # though there aren't any in these datasets or in the database. 

1351 export.saveDatasets(datasets, elements=["htm7"]) 

1352 # Save one of the data IDs again; this should be harmless 

1353 # because of internal deduplication. 

1354 export.saveDataIds([datasets[0].dataId]) 

1355 # Save some dimension records directly. 

1356 export.saveDimensionData("skymap", [skymapRecord]) 

1357 self.assertTrue(os.path.exists(exportFile)) 

1358 with safeTestTempDir(TESTDIR) as importDir: 

1359 # We always want this to be a local posix butler 

1360 Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml"))) 

1361 # Calling script.butlerImport tests the implementation of the 

1362 # butler command line interface "import" subcommand. Functions 

1363 # in the script folder are generally considered protected and 

1364 # should not be used as public API. 

1365 with open(exportFile) as f: 

1366 script.butlerImport( 

1367 importDir, 

1368 export_file=f, 

1369 directory=exportDir, 

1370 transfer="auto", 

1371 skip_dimensions=None, 

1372 ) 
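
# [Editor's sketch, a hypothetical alternative to the script call above] The
# same import can be performed through the public API on a writeable butler:
#
#     target = Butler(importDir, writeable=True)
#     target.import_(filename=exportFile, directory=exportDir, transfer="auto")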

1373 importButler = Butler(importDir, run=self.default_run) 

1374 for ref in datasets: 

1375 with self.subTest(ref=ref): 

1376 # Test for existence by passing in the DatasetType and 

1377 # data ID separately, to avoid lookup by dataset_id. 

1378 self.assertTrue(importButler.exists(ref.datasetType, ref.dataId)) 

1379 self.assertEqual( 

1380 list(importButler.registry.queryDimensionRecords("skymap")), 

1381 [importButler.dimensions["skymap"].RecordClass(**skymapRecord)], 

1382 ) 

1383 

1384 def testRemoveRuns(self) -> None: 

1385 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1386 butler = Butler(self.tmpConfigFile, writeable=True) 

1387 # Load registry data with dimensions to hang datasets off of. 

1388 registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry")) 

1389 butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) 

1390 # Add some RUN-type collections. 

1391 run1 = "run1" 

1392 butler.registry.registerRun(run1) 

1393 run2 = "run2" 

1394 butler.registry.registerRun(run2) 

1395 # Put a dataset in each run. 

1396 metric = makeExampleMetrics() 

1397 dimensions = butler.dimensions.extract(["instrument", "physical_filter"]) 

1398 datasetType = self.addDatasetType( 

1399 "prune_collections_test_dataset", dimensions, storageClass, butler.registry 

1400 ) 

1401 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1) 

1402 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2) 

1403 uri1 = butler.getURI(ref1) 

1404 uri2 = butler.getURI(ref2) 

1405 

1406 with self.assertRaises(OrphanedRecordError): 

1407 butler.registry.removeDatasetType(datasetType.name) 

1408 

1409 # Remove from both runs with different values for unstore. 

1410 butler.removeRuns([run1], unstore=True) 

1411 butler.removeRuns([run2], unstore=False) 

1412 # Should be nothing in registry for either one, and datastore should 

1413 # not think either exists. 

1414 with self.assertRaises(MissingCollectionError): 

1415 butler.registry.getCollectionType(run1) 

1416 with self.assertRaises(MissingCollectionError): 

1417 butler.registry.getCollectionType(run2) 

1418 self.assertFalse(butler.stored(ref1)) 

1419 self.assertFalse(butler.stored(ref2)) 

1420 # The ref we unstored should be gone according to the URI, but the 

1421 # one we forgot should still be around. 

1422 self.assertFalse(uri1.exists()) 

1423 self.assertTrue(uri2.exists()) 

1424 

1425 # Now that the collections have been pruned, we can remove the 

1426 # dataset type. 

1427 butler.registry.removeDatasetType(datasetType.name) 

1428 

1429 with self.assertLogs("lsst.daf.butler.registries", "INFO") as cm: 

1430 butler.registry.removeDatasetType(("test*", "test*")) 

1431 self.assertIn("not defined", "\n".join(cm.output)) 

1432 

1433 

1434class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1435 """PosixDatastore specialization of a butler""" 

1436 

1437 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1438 fullConfigKey: str | None = ".datastore.formatters" 

1439 validationCanFail = True 

1440 datastoreStr = ["/tmp"] 

1441 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"] 

1442 registryStr = "/gen3.sqlite3" 

1443 

1444 def testPathConstructor(self) -> None: 

1445 """Independent test of constructor using PathLike.""" 

1446 butler = Butler(self.tmpConfigFile, run=self.default_run) 

1447 self.assertIsInstance(butler, Butler) 

1448 

1449 # And again with a Path object with the butler yaml 

1450 path = pathlib.Path(self.tmpConfigFile) 

1451 butler = Butler(path, writeable=False) 

1452 self.assertIsInstance(butler, Butler) 

1453 

1454 # And again with a Path object without the butler yaml 

1455 # (making sure we skip it if the tmp config doesn't end 

1456 # in butler.yaml -- which is the case for a subclass) 

1457 if self.tmpConfigFile.endswith("butler.yaml"): 

1458 path = pathlib.Path(os.path.dirname(self.tmpConfigFile)) 

1459 butler = Butler(path, writeable=False) 

1460 self.assertIsInstance(butler, Butler) 

1461 

1462 def testExportTransferCopy(self) -> None: 

1463 """Test local export using all transfer modes""" 

1464 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1465 exportButler = self.runPutGetTest(storageClass, "test_metric") 

1466 # Test that the repo actually has at least one dataset. 

1467 datasets = list(exportButler.registry.queryDatasets(..., collections=...)) 

1468 self.assertGreater(len(datasets), 0) 

1469 uris = [exportButler.getURI(d) for d in datasets] 

1470 assert isinstance(exportButler._datastore, FileDatastore) 

1471 datastoreRoot = exportButler.get_datastore_roots()[exportButler.get_datastore_names()[0]] 

1472 

1473 pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris] 

1474 

1475 for path in pathsInStore: 

1476 # Assume local file system 

1477 assert path is not None 

1478 self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}") 

1479 

1480 for transfer in ("copy", "link", "symlink", "relsymlink"): 

1481 with safeTestTempDir(TESTDIR) as exportDir: 

1482 with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export: 

1483 export.saveDatasets(datasets) 

1484 for path in pathsInStore: 

1485 assert path is not None 

1486 self.assertTrue( 

1487 self.checkFileExists(exportDir, path), 

1488 f"Check that mode {transfer} exported files", 

1489 ) 

1490 

1491 def testPruneDatasets(self) -> None: 

1492 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1493 butler = Butler(self.tmpConfigFile, writeable=True) 

1494 assert isinstance(butler._datastore, FileDatastore) 

1495 # Load registry data with dimensions to hang datasets off of. 

1496 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry")) 

1497 butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) 

1498 # Add some RUN-type collections. 

1499 run1 = "run1" 

1500 butler.registry.registerRun(run1) 

1501 run2 = "run2" 

1502 butler.registry.registerRun(run2) 

1503 # Put some datasets. ref1 and ref2 have the same data ID but are in 

1504 # different runs; ref3 has a different data ID. 

1505 metric = makeExampleMetrics() 

1506 dimensions = butler.dimensions.extract(["instrument", "physical_filter"]) 

1507 datasetType = self.addDatasetType( 

1508 "prune_collections_test_dataset", dimensions, storageClass, butler.registry 

1509 ) 

1510 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1) 

1511 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2) 

1512 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1) 

1513 

1514 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1515 for ref, stored in many_stored.items(): 

1516 self.assertTrue(stored, f"Ref {ref} should be stored") 

1517 

1518 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1519 for ref, exists in many_exists.items(): 

1520 self.assertTrue(exists, f"Checking ref {ref} exists.") 

1521 self.assertEqual(exists, DatasetExistence.VERIFIED, f"Ref {ref} should be stored") 

1522 

1523 # Simple prune. 

1524 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1525 self.assertFalse(butler.exists(ref1.datasetType, ref1.dataId, collections=run1)) 

1526 

1527 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1528 for ref, stored in many_stored.items(): 

1529 self.assertFalse(stored, f"Ref {ref} should not be stored") 

1530 

1531 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1532 for ref, exists in many_exists.items(): 

1533 self.assertEqual(exists, DatasetExistence.UNRECOGNIZED, f"Ref {ref} should not be stored") 

1534 

1535 # Put data back. 

1536 ref1_new = butler.put(metric, ref1) 

1537 self.assertEqual(ref1_new, ref1) # Reuses original ID. 

1538 ref2 = butler.put(metric, ref2) 

1539 

1540 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1541 self.assertTrue(many_stored[ref1]) 

1542 self.assertTrue(many_stored[ref2]) 

1543 self.assertFalse(many_stored[ref3]) 

1544 

1545 ref3 = butler.put(metric, ref3) 

1546 

1547 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1548 for ref, exists in many_exists.items(): 

1549 self.assertTrue(exists, f"Ref {ref} should be stored") 

1550 

1551 # Clear out the datasets from registry and start again. 

1552 refs = [ref1, ref2, ref3] 

1553 butler.pruneDatasets(refs, purge=True, unstore=True) 

1554 for ref in refs: 

1555 butler.put(metric, ref) 

1556 

1557 # Test different forms of file availability. 

1558 # Need to be in a state where: 

1559 # - one ref just has registry record. 

1560 # - one ref has a missing file but a datastore record. 

1561 # - one ref has a missing datastore record but file is there. 

1562 # - one ref does not exist anywhere. 

1563 # Do not need to test a ref that has everything since that is tested 

1564 # above. 

1565 ref0 = DatasetRef( 

1566 datasetType, 

1567 DataCoordinate.standardize( 

1568 {"instrument": "Cam1", "physical_filter": "Cam1-G"}, universe=butler.dimensions 

1569 ), 

1570 run=run1, 

1571 ) 
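
# [Editor's note] DataCoordinate.standardize turns a plain dict into a
# validated DataCoordinate for the given dimension universe; ref0 is never
# put, so it should be unrecognized everywhere below.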

1572 

1573 # Delete from datastore and retain in Registry. 

1574 butler.pruneDatasets([ref1], purge=False, unstore=True, disassociate=False) 

1575 

1576 # Remove the file behind ref2, leaving its datastore record in place. 

1577 uri2 = butler.getURI(ref2) 

1578 uri2.remove() 

1579 

1580 # Make the datastore lose track of ref3. 

1581 butler._datastore.forget([ref3]) 

1582 

1583 # First test with a standard butler. 

1584 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True) 

1585 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1586 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED) 

1587 self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE) 

1588 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED) 

1589 

1590 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=False) 

1591 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1592 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED | DatasetExistence._ASSUMED) 

1593 self.assertEqual(exists_many[ref2], DatasetExistence.KNOWN) 

1594 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ASSUMED) 

1595 self.assertTrue(exists_many[ref2]) 
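
# [Editor's sketch, assuming DatasetExistence is the flag enum exercised
# above] Composite states can be decomposed with bitwise tests:
#
#     state = exists_many[ref2]  # DatasetExistence.KNOWN
#     assert state & DatasetExistence.RECORDED
#     assert state & DatasetExistence.DATASTORE
#     assert bool(state)  # truthy when the dataset is believed to exist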

1596 

1597 # Check that the per-ref query gives the same answer as the bulk query. 

1598 for ref, exists in exists_many.items(): 

1599 self.assertEqual(butler.exists(ref, full_check=False), exists) 

1600 

1601 # Test again with a trusting butler. 

1602 butler._datastore.trustGetRequest = True 

1603 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True) 

1604 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1605 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED) 

1606 self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE) 

1607 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ARTIFACT) 

1608 

1609 # Check that the per-ref query gives the same answer as the bulk query. 

1610 for ref, exists in exists_many.items(): 

1611 self.assertEqual(butler.exists(ref, full_check=True), exists) 

1612 

1613 # Create a ref that reuses the UUID of an existing ref but does 

1614 # not refer to the same dataset. 

1615 ref_bad = DatasetRef(datasetType, dataId=ref3.dataId, run=ref3.run, id=ref2.id) 

1616 with self.assertRaises(ValueError): 

1617 butler.exists(ref_bad) 

1618 

1619 # Create a ref that has a compatible storage class. 

1620 ref_compat = ref2.overrideStorageClass("StructuredDataDict") 

1621 exists = butler.exists(ref_compat) 

1622 self.assertEqual(exists, exists_many[ref2]) 

1623 

1624 # Remove everything and start from scratch. 

1625 butler._datastore.trustGetRequest = False 

1626 butler.pruneDatasets(refs, purge=True, unstore=True) 

1627 for ref in refs: 

1628 butler.put(metric, ref) 

1629 

1630 # These tests mess directly with the trash table and can leave the 

1631 # datastore in an odd state. Do them at the end. 

1632 # Check that in normal mode, deleting the record means that 

1633 # emptying the trash will not touch the file. 

1634 uri1 = butler.getURI(ref1) 

1635 butler._datastore.bridge.moveToTrash([ref1], transaction=None) # Update the dataset_location table 

1636 butler._datastore.forget([ref1]) 

1637 butler._datastore.trash(ref1) 

1638 butler._datastore.emptyTrash() 

1639 self.assertTrue(uri1.exists()) 

1640 uri1.remove() # Clean it up. 

1641 

1642 # Simulate execution butler setup by deleting the datastore 

1643 # record but keeping the file around and enabling trust mode. 

1644 butler._datastore.trustGetRequest = True 

1645 uris = butler.get_many_uris([ref2, ref3]) 

1646 uri2 = uris[ref2].primaryURI 

1647 uri3 = uris[ref3].primaryURI 

1648 self.assertTrue(uri2.exists()) 

1649 self.assertTrue(uri3.exists()) 

1650 

1651 # Remove the datastore record. 

1652 butler._datastore.bridge.moveToTrash([ref2], transaction=None) # Update the dataset_location table 

1653 butler._datastore.forget([ref2]) 

1654 self.assertTrue(uri2.exists()) 

1655 butler._datastore.trash([ref2, ref3]) 

1656 # Immediate removal of the ref2 file, since its record is already gone. 

1657 self.assertFalse(uri2.exists()) 

1658 # But ref3 has to wait for the trash to be emptied. 

1659 self.assertTrue(uri3.exists()) 

1660 butler._datastore.emptyTrash() 

1661 self.assertFalse(uri3.exists()) 
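
# [Editor's note, summarizing the behaviour exercised above] trash() only
# marks artifacts and emptyTrash() deletes them, except that in trust mode a
# trashed ref with no datastore record is deleted immediately because there
# is no record left to verify against.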

1662 

1663 # Clear out the datasets from registry. 

1664 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1665 

1666 def testPytypeCoercion(self) -> None: 

1667 """Test python type coercion on Butler.get and put.""" 

1668 # Store some data with the normal example storage class. 

1669 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1670 datasetTypeName = "test_metric" 

1671 butler = self.runPutGetTest(storageClass, datasetTypeName) 

1672 

1673 dataId = {"instrument": "DummyCamComp", "visit": 423} 

1674 metric = butler.get(datasetTypeName, dataId=dataId) 

1675 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample") 

1676 

1677 datasetType_ori = butler.registry.getDatasetType(datasetTypeName) 

1678 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents") 

1679 

1680 # Now need to hack the registry dataset type definition. 

1681 # There is no API for this. 

1682 assert isinstance(butler._registry, SqlRegistry) 

1683 manager = butler._registry._managers.datasets 

1684 assert hasattr(manager, "_db") and hasattr(manager, "_static") 

1685 manager._db.update( 

1686 manager._static.dataset_type, 

1687 {"name": datasetTypeName}, 

1688 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"}, 

1689 ) 
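
# [Editor's note, an assumption about the low-level Database.update
# convention] the second argument maps column names to *keys* in the row
# dicts, which is why the row above carries the match value under the key
# named by datasetTypeName rather than under "name".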

1690 

1691 # Force reset of dataset type cache 

1692 butler.registry.refresh() 

1693 

1694 datasetType_new = butler.registry.getDatasetType(datasetTypeName) 

1695 self.assertEqual(datasetType_new.name, datasetType_ori.name) 

1696 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel") 

1697 

1698 metric_model = butler.get(datasetTypeName, dataId=dataId) 

1699 self.assertNotEqual(type(metric_model), type(metric)) 

1700 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel") 

1701 

1702 # Put the model and read it back to show that everything now 

1703 # works as normal. 

1704 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424) 

1705 metric_model_new = butler.get(metric_ref) 

1706 self.assertEqual(metric_model_new, metric_model) 

1707 

1708 # Hack the storage class again to something that will make the get 

1709 # fail because no type conversion is possible. 

1710 manager._db.update( 

1711 manager._static.dataset_type, 

1712 {"name": datasetTypeName}, 

1713 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"}, 

1714 ) 

1715 butler.registry.refresh() 

1716 

1717 with self.assertRaises(ValueError): 

1718 butler.get(datasetTypeName, dataId=dataId) 

1719 

1720 

1721@unittest.skipUnless(testing is not None, "testing.postgresql module not found") 

1722class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1723 """PosixDatastore specialization of a butler using Postgres""" 

1724 

1725 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1726 fullConfigKey = ".datastore.formatters" 

1727 validationCanFail = True 

1728 datastoreStr = ["/tmp"] 

1729 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"] 

1730 registryStr = "PostgreSQL@test" 

1731 postgresql: Any 

1732 

1733 @staticmethod 

1734 def _handler(postgresql: Any) -> None: 

1735 engine = sqlalchemy.engine.create_engine(postgresql.url()) 

1736 with engine.begin() as connection: 

1737 connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;")) 
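
# [Editor's note, an assumption] btree_gist is required so the registry can
# create GiST exclusion constraints that mix scalar columns with timespan
# ranges.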

1738 

1739 @classmethod 

1740 def setUpClass(cls) -> None: 

1741 # Create the postgres test server. 

1742 cls.postgresql = testing.postgresql.PostgresqlFactory( 

1743 cache_initialized_db=True, on_initialized=cls._handler 

1744 ) 

1745 super().setUpClass() 

1746 

1747 @classmethod 

1748 def tearDownClass(cls) -> None: 

1749 # Clean up any lingering SQLAlchemy engines/connections 

1750 # so they're closed before we shut down the server. 

1751 gc.collect() 

1752 cls.postgresql.clear_cache() 

1753 super().tearDownClass() 

1754 

1755 def setUp(self) -> None: 

1756 self.server = self.postgresql() 

1757 

1758 # Need to add a registry section to the config. 

1759 self._temp_config = False 

1760 config = Config(self.configFile) 

1761 config["registry", "db"] = self.server.url() 

1762 with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh: 

1763 config.dump(fh) 

1764 self.configFile = fh.name 

1765 self._temp_config = True 

1766 super().setUp() 

1767 

1768 def tearDown(self) -> None: 

1769 self.server.stop() 

1770 if self._temp_config and os.path.exists(self.configFile): 

1771 os.remove(self.configFile) 

1772 super().tearDown() 

1773 

1774 def testMakeRepo(self) -> None: 

1775 # The base class test assumes that it is using SQLite and that 

1776 # the config file is acceptable to SQLite. 

1777 raise unittest.SkipTest("Postgres config is not compatible with this test.") 

1778 

1779 

1780class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1781 """InMemoryDatastore specialization of a butler""" 

1782 

1783 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1784 fullConfigKey = None 

1785 useTempRoot = False 

1786 validationCanFail = False 

1787 datastoreStr = ["datastore='InMemory"] 

1788 datastoreName = ["InMemoryDatastore@"] 

1789 registryStr = "/gen3.sqlite3" 

1790 

1791 def testIngest(self) -> None: 

1792 pass  # Ingest does not apply to an in-memory datastore. 

1793 

1794 

1795class ChainedDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1796 """PosixDatastore specialization""" 

1797 

1798 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

1799 fullConfigKey = ".datastore.datastores.1.formatters" 

1800 validationCanFail = True 

1801 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"] 

1802 datastoreName = [ 

1803 "InMemoryDatastore@", 

1804 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1", 

1805 "SecondDatastore", 

1806 ] 

1807 registryStr = "/gen3.sqlite3" 

1808 

1809 

1810class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase): 

1811 """Test that a yaml file in one location can refer to a root in another.""" 

1812 

1813 datastoreStr = ["dir1"] 

1814 # Disable the makeRepo test since we are deliberately not using 

1815 # butler.yaml as the config name. 

1816 fullConfigKey = None 

1817 

1818 def setUp(self) -> None: 

1819 self.root = makeTestTempDir(TESTDIR) 

1820 

1821 # Make a new repository in one place 

1822 self.dir1 = os.path.join(self.root, "dir1") 

1823 Butler.makeRepo(self.dir1, config=Config(self.configFile)) 

1824 

1825 # Move the yaml file to a different place and add a "root" key. 

1826 self.dir2 = os.path.join(self.root, "dir2") 

1827 os.makedirs(self.dir2, exist_ok=True) 

1828 configFile1 = os.path.join(self.dir1, "butler.yaml") 

1829 config = Config(configFile1) 

1830 config["root"] = self.dir1 

1831 configFile2 = os.path.join(self.dir2, "butler2.yaml") 

1832 config.dumpToUri(configFile2) 

1833 os.remove(configFile1) 

1834 self.tmpConfigFile = configFile2 

1835 

1836 def testFileLocations(self) -> None: 

1837 self.assertNotEqual(self.dir1, self.dir2) 

1838 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml"))) 

1839 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml"))) 

1840 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3"))) 

1841 

1842 

1843class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase): 

1844 """Test that a config file created by makeRepo outside of repo works.""" 

1845 

1846 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1847 

1848 def setUp(self) -> None: 

1849 self.root = makeTestTempDir(TESTDIR) 

1850 self.root2 = makeTestTempDir(TESTDIR) 

1851 

1852 self.tmpConfigFile = os.path.join(self.root2, "different.yaml") 

1853 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1854 

1855 def tearDown(self) -> None: 

1856 if os.path.exists(self.root2): 

1857 shutil.rmtree(self.root2, ignore_errors=True) 

1858 super().tearDown() 

1859 

1860 def testConfigExistence(self) -> None: 

1861 c = Config(self.tmpConfigFile) 

1862 uri_config = ResourcePath(c["root"]) 

1863 uri_expected = ResourcePath(self.root, forceDirectory=True) 

1864 self.assertEqual(uri_config.geturl(), uri_expected.geturl()) 

1865 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path") 

1866 

1867 def testPutGet(self) -> None: 

1868 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1869 self.runPutGetTest(storageClass, "test_metric") 

1870 

1871 

1872class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase): 

1873 """Test that a config file created by makeRepo outside of repo works.""" 

1874 

1875 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1876 

1877 def setUp(self) -> None: 

1878 self.root = makeTestTempDir(TESTDIR) 

1879 self.root2 = makeTestTempDir(TESTDIR) 

1880 

1881 self.tmpConfigFile = self.root2 

1882 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1883 

1884 def testConfigExistence(self) -> None: 

1885 # Append the yaml file name, since otherwise the Config constructor 

1886 # cannot determine the file type. 

1887 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml") 

1888 super().testConfigExistence() 

1889 

1890 

1891class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase): 

1892 """Test that a config file created by makeRepo outside of repo works.""" 

1893 

1894 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1895 

1896 def setUp(self) -> None: 

1897 self.root = makeTestTempDir(TESTDIR) 

1898 self.root2 = makeTestTempDir(TESTDIR) 

1899 

1900 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl() 

1901 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1902 

1903 

1904@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!") 

1905class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1906 """S3Datastore specialization of a butler; an S3 storage Datastore + 

1907 a local in-memory SqlRegistry. 

1908 """ 

1909 

1910 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml") 

1911 fullConfigKey = None 

1912 validationCanFail = True 

1913 

1914 bucketName = "anybucketname" 

1915 """Name of the Bucket that will be used in the tests. The name is read from 

1916 the config file used with the tests during set-up. 

1917 """ 

1918 

1919 root = "butlerRoot/" 

1920 """Root repository directory expected to be used in case useTempRoot=False. 

1921 Otherwise the root is set to a 20 characters long randomly generated string 

1922 during set-up. 

1923 """ 

1924 

1925 datastoreStr = [f"datastore={root}"] 

1926 """Contains all expected root locations in a format expected to be 

1927 returned by Butler stringification. 

1928 """ 

1929 

1930 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"] 

1931 """The expected format of the S3 Datastore string.""" 

1932 

1933 registryStr = "/gen3.sqlite3" 

1934 """Expected format of the Registry string.""" 

1935 

1936 mock_s3 = mock_s3() 

1937 """The mocked s3 interface from moto.""" 

1938 

1939 def genRoot(self) -> str: 

1940 """Return a random string of len 20 to serve as a root 

1941 name for the temporary bucket repo. 

1942 

1943 This is equivalent to tempfile.mkdtemp as this is what self.root 

1944 becomes when useTempRoot is True. 

1945 """ 

1946 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20)) 

1947 return rndstr + "/" 

1948 

1949 def setUp(self) -> None: 

1950 config = Config(self.configFile) 

1951 uri = ResourcePath(config[".datastore.datastore.root"]) 

1952 self.bucketName = uri.netloc 

1953 

1954 # Enable S3 mocking of tests. 

1955 self.mock_s3.start() 

1956 

1957 # Set up some fake credentials if they do not exist. 

1958 self.usingDummyCredentials = setAwsEnvCredentials() 

1959 

1960 if self.useTempRoot: 

1961 self.root = self.genRoot() 

1962 rooturi = f"s3://{self.bucketName}/{self.root}" 

1963 config.update({"datastore": {"datastore": {"root": rooturi}}}) 

1964 

1965 # Need a local folder to store the registry database. 

1966 self.reg_dir = makeTestTempDir(TESTDIR) 

1967 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

1968 

1969 # Moto needs the bucket to exist before the datastore can use it 

1970 # (the name used to be the class attribute bucketName). 

1971 s3 = boto3.resource("s3") 

1972 s3.create_bucket(Bucket=self.bucketName) 

1973 

1974 self.datastoreStr = [f"datastore='{rooturi}'"] 

1975 self.datastoreName = [f"FileDatastore@{rooturi}"] 

1976 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False) 

1977 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml") 

1978 

1979 def tearDown(self) -> None: 

1980 s3 = boto3.resource("s3") 

1981 bucket = s3.Bucket(self.bucketName) 

1982 try: 

1983 bucket.objects.all().delete() 

1984 except botocore.exceptions.ClientError as e: 

1985 if e.response["Error"]["Code"] == "404": 

1986 # The key was not reachable; nothing to delete. 

1987 pass 

1988 else: 

1989 raise 

1990 

1991 bucket = s3.Bucket(self.bucketName) 

1992 bucket.delete() 

1993 

1994 # Stop the S3 mock. 

1995 self.mock_s3.stop() 

1996 

1997 # Unset any dummy credentials that may have been set. 

1998 if self.usingDummyCredentials: 

1999 unsetAwsEnvCredentials() 

2000 

2001 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

2002 shutil.rmtree(self.reg_dir, ignore_errors=True) 

2003 

2004 if self.useTempRoot and os.path.exists(self.root): 

2005 shutil.rmtree(self.root, ignore_errors=True) 

2006 

2007 super().tearDown() 

2008 

2009 

2010class PosixDatastoreTransfers(unittest.TestCase): 

2011 """Test data transfers between butlers. 

2012 

2013 Tests cover different dataset ID managers: UUID to UUID and integer to 

2014 integer are tested. UUID to integer is not supported since we do not 

2015 currently want to allow that. Integer to UUID is supported with the 

2016 caveat that a UUID4 will be generated, which is incorrect for raw 

2017 dataset types; the test ignores that. 

2018 """ 

2019 

2020 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

2021 storageClassFactory: StorageClassFactory 

2022 

2023 @classmethod 

2024 def setUpClass(cls) -> None: 

2025 cls.storageClassFactory = StorageClassFactory() 

2026 cls.storageClassFactory.addFromConfig(cls.configFile) 

2027 

2028 def setUp(self) -> None: 

2029 self.root = makeTestTempDir(TESTDIR) 

2030 self.config = Config(self.configFile) 

2031 

2032 def tearDown(self) -> None: 

2033 removeTestTempDir(self.root) 

2034 

2035 def create_butler(self, manager: str, label: str) -> Butler: 

2036 config = Config(self.configFile) 

2037 config["registry", "managers", "datasets"] = manager 

2038 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True) 

2039 

2040 def create_butlers(self, manager1: str | None = None, manager2: str | None = None) -> None: 

2041 default = "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID" 

2042 if manager1 is None: 

2043 manager1 = default 

2044 if manager2 is None: 

2045 manager2 = default 

2046 self.source_butler = self.create_butler(manager1, "1") 

2047 self.target_butler = self.create_butler(manager2, "2") 

2048 

2049 def testTransferUuidToUuid(self) -> None: 

2050 self.create_butlers() 

2051 self.assertButlerTransfers() 

2052 

2053 def _enable_trust(self, datastore: Datastore) -> None: 

2054 if hasattr(datastore, "trustGetRequest"): 

2055 datastore.trustGetRequest = True 

2056 elif hasattr(datastore, "datastores"): 

2057 for child in datastore.datastores: 

2058 if hasattr(child, "trustGetRequest"): 

2059 child.trustGetRequest = True 
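
# [Editor's note] A ChainedDatastore has no trustGetRequest of its own but
# exposes its children via "datastores", hence the single level of
# unwrapping above, e.g.:
#
#     self._enable_trust(self.source_butler._datastore)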

2060 

2061 def testTransferMissing(self) -> None: 

2062 """Test transfers where datastore records are missing. 

2063 

2064 This is how execution butler works. 

2065 """ 

2066 self.create_butlers() 

2067 

2068 # Configure the source butler to allow trust. 

2069 self._enable_trust(self.source_butler._datastore) 

2070 

2071 self.assertButlerTransfers(purge=True) 

2072 

2073 def testTransferMissingDisassembly(self) -> None: 

2074 """Test transfers where datastore records are missing. 

2075 

2076 This is how execution butler works. 

2077 """ 

2078 self.create_butlers() 

2079 

2080 # Configure the source butler to allow trust. 

2081 self._enable_trust(self.source_butler._datastore) 

2082 

2083 # Test disassembly. 

2084 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite") 

2085 

2086 def testAbsoluteURITransferDirect(self) -> None: 

2087 """Test transfer using an absolute URI.""" 

2088 self._absolute_transfer("auto") 

2089 

2090 def testAbsoluteURITransferCopy(self) -> None: 

2091 """Test transfer using an absolute URI.""" 

2092 self._absolute_transfer("copy") 

2093 

2094 def _absolute_transfer(self, transfer: str) -> None: 

2095 self.create_butlers() 

2096 

2097 storageClassName = "StructuredData" 

2098 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

2099 datasetTypeName = "random_data" 

2100 run = "run1" 

2101 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

2102 

2103 dimensions = self.source_butler.dimensions.extract(()) 

2104 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

2105 self.source_butler.registry.registerDatasetType(datasetType) 

2106 

2107 metrics = makeExampleMetrics() 

2108 with ResourcePath.temporary_uri(suffix=".json") as temp: 

2109 dataId = DataCoordinate.makeEmpty(self.source_butler.dimensions) 

2110 source_refs = [DatasetRef(datasetType, dataId, run=run)] 

2111 temp.write(json.dumps(metrics.exportAsDict()).encode()) 

2112 dataset = FileDataset(path=temp, refs=source_refs) 

2113 self.source_butler.ingest(dataset, transfer="direct") 

2114 

2115 self.target_butler.transfer_from( 

2116 self.source_butler, dataset.refs, register_dataset_types=True, transfer=transfer 

2117 ) 

2118 

2119 uri = self.target_butler.getURI(dataset.refs[0]) 

2120 if transfer == "auto": 

2121 self.assertEqual(uri, temp) 

2122 else: 

2123 self.assertNotEqual(uri, temp) 
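
# [Editor's note, based on the assertions above] a "direct" ingest records
# the absolute URI without copying the artifact, so an "auto" transfer keeps
# pointing at the original location while "copy" materializes a new file
# inside the target datastore.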

2124 

2125 def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "StructuredData") -> None: 

2126 """Test that a run can be transferred to another butler.""" 

2127 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

2128 datasetTypeName = "random_data" 

2129 

2130 # The test will create three collections, of which we will want to 

2131 # transfer two. 

2132 runs = ["run1", "run2", "other"] 

2133 

2134 # Also want to use two different dataset types to ensure that 

2135 # grouping works. 

2136 datasetTypeNames = ["random_data", "random_data_2"] 

2137 

2138 # Create the run collections in the source butler. 

2139 for run in runs: 

2140 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

2141 

2142 # Create dimensions in source butler. 

2143 n_exposures = 30 

2144 self.source_butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

2145 self.source_butler.registry.insertDimensionData( 

2146 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

2147 ) 

2148 self.source_butler.registry.insertDimensionData( 

2149 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"} 

2150 ) 

2151 

2152 for i in range(n_exposures): 

2153 self.source_butler.registry.insertDimensionData( 

2154 "exposure", 

2155 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"}, 

2156 ) 

2157 

2158 # Create dataset types in the source butler. 

2159 dimensions = self.source_butler.dimensions.extract(["instrument", "exposure"]) 

2160 for datasetTypeName in datasetTypeNames: 

2161 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

2162 self.source_butler.registry.registerDatasetType(datasetType) 

2163 

2164 # Write a dataset to an unrelated run -- this will ensure that 

2165 # we are rewriting integer dataset IDs in the target if necessary. 

2166 # This is not relevant for UUID-based managers. 

2167 run = "distraction" 

2168 butler = Butler(butler=self.source_butler, run=run) 

2169 butler.put( 

2170 makeExampleMetrics(), 

2171 datasetTypeName, 

2172 exposure=1, 

2173 instrument="DummyCamComp", 

2174 physical_filter="d-r", 

2175 ) 

2176 

2177 # Write some example metrics to the source butler. 

2178 butler = Butler(butler=self.source_butler) 

2179 

2180 # Set of DatasetRefs that should be in the list of refs to transfer 

2181 # but which will not be transferred. 

2182 deleted: set[DatasetRef] = set() 

2183 

2184 n_expected = 20 # Number of datasets expected to be transferred 

2185 source_refs = [] 

2186 for i in range(n_exposures): 

2187 # Put a third of the datasets into each collection; only retain 

2188 # two thirds. 

2189 index = i % 3 

2190 run = runs[index] 

2191 datasetTypeName = datasetTypeNames[i % 2] 

2192 

2193 metric = MetricsExample( 

2194 summary={"counter": i}, output={"text": "metric"}, data=[2 * x for x in range(i)] 

2195 ) 

2196 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"} 

2197 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run) 

2198 

2199 # Remove the datastore record using the low-level API. 

2200 if purge: 

2201 # Remove records for a fraction. 

2202 if index == 1: 

2203 # For one of these delete the file as well. 

2204 # This allows the "missing" code to filter the 

2205 # file out. 

2206 # Access the individual datastores. 

2207 datastores = [] 

2208 if hasattr(butler._datastore, "datastores"): 

2209 datastores.extend(butler._datastore.datastores) 

2210 else: 

2211 datastores.append(butler._datastore) 

2212 

2213 if not deleted: 

2214 # For a chained datastore we need to remove 

2215 # files in each child datastore. 

2216 for datastore in datastores: 

2217 # The file might not be known to the datastore 

2218 # if constraints are used. 

2219 try: 

2220 primary, uris = datastore.getURIs(ref) 

2221 except FileNotFoundError: 

2222 continue 

2223 if primary: 

2224 if primary.scheme != "mem": 

2225 primary.remove() 

2226 for uri in uris.values(): 

2227 if uri.scheme != "mem": 

2228 uri.remove() 

2229 n_expected -= 1 

2230 deleted.add(ref) 

2231 

2232 # Remove the datastore record. 

2233 for datastore in datastores: 

2234 if hasattr(datastore, "removeStoredItemInfo"): 

2235 datastore.removeStoredItemInfo(ref) 

2236 

2237 if index < 2: 

2238 source_refs.append(ref) 

2239 if ref not in deleted: 

2240 new_metric = butler.get(ref) 

2241 self.assertEqual(new_metric, metric) 

2242 

2243 # Create some bad dataset types to ensure we check for inconsistent 

2244 # definitions. 

2245 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList") 

2246 for datasetTypeName in datasetTypeNames: 

2247 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass) 

2248 self.target_butler.registry.registerDatasetType(datasetType) 

2249 with self.assertRaises(ConflictingDefinitionError) as cm: 

2250 self.target_butler.transfer_from(self.source_butler, source_refs) 

2251 self.assertIn("dataset type differs", str(cm.exception)) 

2252 

2253 # And remove the bad definitions. 

2254 for datasetTypeName in datasetTypeNames: 

2255 self.target_butler.registry.removeDatasetType(datasetTypeName) 

2256 

2257 # Transfer without creating dataset types should fail. 

2258 with self.assertRaises(KeyError): 

2259 self.target_butler.transfer_from(self.source_butler, source_refs) 

2260 

2261 # Transfer without creating dimensions should fail. 

2262 with self.assertRaises(ConflictingDefinitionError) as cm: 

2263 self.target_butler.transfer_from(self.source_butler, source_refs, register_dataset_types=True) 

2264 self.assertIn("dimension", str(cm.exception)) 

2265 

2266 # The failed transfer above leaves registry in an inconsistent 

2267 # state because the run is created but then rolled back without 

2268 # the collection cache being cleared. For now force a refresh. 

2269 # Can remove with DM-35498. 

2270 self.target_butler.registry.refresh() 

2271 

2272 # Now transfer them to the second butler, including dimensions. 

2273 with self.assertLogs(level=logging.DEBUG) as log_cm: 

2274 transferred = self.target_butler.transfer_from( 

2275 self.source_butler, 

2276 source_refs, 

2277 register_dataset_types=True, 

2278 transfer_dimensions=True, 

2279 ) 

2280 self.assertEqual(len(transferred), n_expected) 

2281 log_output = ";".join(log_cm.output) 

2282 

2283 # A ChainedDatastore will use the in-memory datastore for mexists 

2284 # so we cannot rely on the mexists log message. 

2285 self.assertIn("Number of datastore records found in source", log_output) 

2286 self.assertIn("Creating output run", log_output) 

2287 

2288 # Do the transfer twice to ensure that it will do nothing extra. 

2289 # Only do this if purge=True because it does not work for int 

2290 # dataset_id. 

2291 if purge: 

2292 # This should not need to register dataset types. 

2293 transferred = self.target_butler.transfer_from(self.source_butler, source_refs) 

2294 self.assertEqual(len(transferred), n_expected) 

2295 

2296 # Also do an explicit low-level transfer to trigger some 

2297 # edge cases. 

2298 with self.assertLogs(level=logging.DEBUG) as log_cm: 

2299 self.target_butler._datastore.transfer_from(self.source_butler._datastore, source_refs) 

2300 log_output = ";".join(log_cm.output) 

2301 self.assertIn("no file artifacts exist", log_output) 

2302 

2303 with self.assertRaises((TypeError, AttributeError)): 

2304 self.target_butler._datastore.transfer_from(self.source_butler, source_refs) # type: ignore 

2305 

2306 with self.assertRaises(ValueError): 

2307 self.target_butler._datastore.transfer_from( 

2308 self.source_butler._datastore, source_refs, transfer="split" 

2309 ) 

2310 

2311 # Now try to get the same refs from the new butler. 

2312 for ref in source_refs: 

2313 if ref not in deleted: 

2314 new_metric = self.target_butler.get(ref) 

2315 old_metric = self.source_butler.get(ref) 

2316 self.assertEqual(new_metric, old_metric) 

2317 

2318 # Now prune the run2 collection and create a CHAINED collection instead. 

2319 # This should block the transfer. 

2320 self.target_butler.removeRuns(["run2"], unstore=True) 

2321 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED) 

2322 with self.assertRaises(CollectionTypeError): 

2323 # Re-importing the run1 datasets can be problematic if they 

2324 # use integer IDs, so filter those out. 

2325 to_transfer = [ref for ref in source_refs if ref.run == "run2"] 

2326 self.target_butler.transfer_from(self.source_butler, to_transfer) 

2327 

2328 

2329class ChainedDatastoreTransfers(PosixDatastoreTransfers): 

2330 """Test transfers using a chained datastore.""" 

2331 

2332 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

2333 

2334 

2335def setup_module(module: types.ModuleType) -> None: 

2336 """Set up the module for pytest.""" 

2337 clean_environment() 

2338 

2339 

2340if __name__ == "__main__": 

2341 clean_environment() 

2342 unittest.main()