Coverage for tests/test_butler.py: 12% (1258 statements; coverage.py v7.2.7, created at 2023-06-15 09:13 +0000)

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Tests for Butler. 

23""" 

24from __future__ import annotations 

25 

26import gc 

27import json 

28import logging 

29import os 

30import pathlib 

31import pickle 

32import posixpath 

33import random 

34import shutil 

35import string 

36import tempfile 

37import unittest 

38import uuid 

39from collections.abc import Mapping 

40from typing import TYPE_CHECKING, Any, cast 

41 

42try: 

43 import boto3 

44 import botocore 

45 from moto import mock_s3 # type: ignore[import] 

46except ImportError: 

47 boto3 = None 

48 

49 def mock_s3(cls): 

50 """A no-op decorator in case moto mock_s3 can not be imported.""" 

51 return cls 

52 
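# Illustrative usage sketch (assumed, not shown in this excerpt): when moto is
# available, ``mock_s3`` stands up a mocked S3 endpoint for an entire test
# class; when it is not, the no-op fallback above leaves the class unchanged
# and the S3-backed tests are expected to skip themselves. The class name
# below is hypothetical.
#
#     @mock_s3
#     class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
#         """Butler tests against an S3-backed FileDatastore."""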

53 

54try: 

55 # It's possible but silly to have testing.postgresql installed without 

56 # having the postgresql server installed (because then nothing in 

57 # testing.postgresql would work), so we use the presence of that module 

58 # to test whether we can expect the server to be available. 

59 import testing.postgresql # type: ignore[import] 

60except ImportError: 

61 testing = None 

62 

63import astropy.time 

64import sqlalchemy 

65from lsst.daf.butler import ( 

66 Butler, 

67 ButlerConfig, 

68 ButlerRepoIndex, 

69 CollectionType, 

70 Config, 

71 DataCoordinate, 

72 DatasetExistence, 

73 DatasetRef, 

74 DatasetType, 

75 FileDataset, 

76 FileTemplate, 

77 FileTemplateValidationError, 

78 StorageClassFactory, 

79 ValidationError, 

80 script, 

81) 

82from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG 

83from lsst.daf.butler.datastores.fileDatastore import FileDatastore 

84from lsst.daf.butler.registries.sql import SqlRegistry 

85from lsst.daf.butler.registry import ( 

86 CollectionError, 

87 CollectionTypeError, 

88 ConflictingDefinitionError, 

89 DataIdValueError, 

90 MissingCollectionError, 

91 OrphanedRecordError, 

92) 

93from lsst.daf.butler.tests import MetricsExample, MultiDetectorFormatter 

94from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir, safeTestTempDir 

95from lsst.resources import ResourcePath 

96from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials 

97from lsst.utils import doImportType 

98from lsst.utils.introspection import get_full_type_name 

99 

100if TYPE_CHECKING: 

101 from lsst.daf.butler import Datastore, DimensionGraph, Registry, StorageClass 

102 

103TESTDIR = os.path.abspath(os.path.dirname(__file__)) 

104 

105 

106def clean_environment() -> None: 

107 """Remove external environment variables that affect the tests.""" 

108 for k in ( 

109 "DAF_BUTLER_REPOSITORY_INDEX", 

110 "S3_ENDPOINT_URL", 

111 "AWS_ACCESS_KEY_ID", 

112 "AWS_SECRET_ACCESS_KEY", 

113 "AWS_SHARED_CREDENTIALS_FILE", 

114 ): 

115 os.environ.pop(k, None) 

116 
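# Illustrative usage sketch (assumed, not shown in this excerpt): test cases
# that depend on AWS credentials or on a repository index typically call this
# helper before constructing any butlers, e.g.
#
#     @classmethod
#     def setUpClass(cls):
#         clean_environment()
#         super().setUpClass()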

117 

118def makeExampleMetrics(): 

119 return MetricsExample( 

120 {"AM1": 5.2, "AM2": 30.6}, 

121 {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}}, 

122 [563, 234, 456.7, 752, 8, 9, 27], 

123 ) 

124 

125 

126class TransactionTestError(Exception): 

127 """Specific error for testing transactions, to prevent misdiagnosing 

128 that might otherwise occur when a standard exception is used. 

129 """ 

130 

131 pass 

132 

133 

134class ButlerConfigTests(unittest.TestCase): 

135 """Simple tests for ButlerConfig that are not tested in any other test 

136 cases.""" 

137 

138 def testSearchPath(self): 

139 configFile = os.path.join(TESTDIR, "config", "basic", "butler.yaml") 

140 with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm: 

141 config1 = ButlerConfig(configFile) 

142 self.assertNotIn("testConfigs", "\n".join(cm.output)) 

143 

144 overrideDirectory = os.path.join(TESTDIR, "config", "testConfigs") 

145 with self.assertLogs("lsst.daf.butler", level="DEBUG") as cm: 

146 config2 = ButlerConfig(configFile, searchPaths=[overrideDirectory]) 

147 self.assertIn("testConfigs", "\n".join(cm.output)) 

148 

149 key = ("datastore", "records", "table") 

150 self.assertNotEqual(config1[key], config2[key]) 

151 self.assertEqual(config2[key], "override_record") 

152 

153 

154class ButlerPutGetTests(TestCaseMixin): 

155 """Helper method for running a suite of put/get tests from different 

156 butler configurations.""" 

157 

158 root: str 

159 default_run = "ingésτ😺" 

160 storageClassFactory: StorageClassFactory 

161 configFile: str 

162 tmpConfigFile: str 

163 

164 @staticmethod 

165 def addDatasetType( 

166 datasetTypeName: str, dimensions: DimensionGraph, storageClass: StorageClass | str, registry: Registry 

167 ) -> DatasetType: 

168 """Create a DatasetType and register it""" 

169 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

170 registry.registerDatasetType(datasetType) 

171 return datasetType 

172 

173 @classmethod 

174 def setUpClass(cls) -> None: 

175 cls.storageClassFactory = StorageClassFactory() 

176 cls.storageClassFactory.addFromConfig(cls.configFile) 

177 

178 def assertGetComponents(self, butler, datasetRef, components, reference, collections=None) -> None: 

179 datasetType = datasetRef.datasetType 

180 dataId = datasetRef.dataId 

181 deferred = butler.getDeferred(datasetRef) 

182 

183 for component in components: 

184 compTypeName = datasetType.componentTypeName(component) 

185 result = butler.get(compTypeName, dataId, collections=collections) 

186 self.assertEqual(result, getattr(reference, component)) 

187 result_deferred = deferred.get(component=component) 

188 self.assertEqual(result_deferred, result) 

189 

190 def tearDown(self) -> None: 

191 removeTestTempDir(self.root) 

192 

193 def create_butler( 

194 self, run: str, storageClass: StorageClass | str, datasetTypeName: str 

195 ) -> tuple[Butler, DatasetType]: 

196 butler = Butler(self.tmpConfigFile, run=run) 

197 

198 collections = set(butler.registry.queryCollections()) 

199 self.assertEqual(collections, {run}) 

200 

201 # Create and register a DatasetType 

202 dimensions = butler.dimensions.extract(["instrument", "visit"]) 

203 

204 datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry) 

205 

206 # Add needed Dimensions 

207 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

208 butler.registry.insertDimensionData( 

209 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

210 ) 

211 butler.registry.insertDimensionData( 

212 "visit_system", {"instrument": "DummyCamComp", "id": 1, "name": "default"} 

213 ) 

214 visit_start = astropy.time.Time("2020-01-01 08:00:00.123456789", scale="tai") 

215 visit_end = astropy.time.Time("2020-01-01 08:00:36.66", scale="tai") 

216 butler.registry.insertDimensionData( 

217 "visit", 

218 { 

219 "instrument": "DummyCamComp", 

220 "id": 423, 

221 "name": "fourtwentythree", 

222 "physical_filter": "d-r", 

223 "visit_system": 1, 

224 "datetime_begin": visit_start, 

225 "datetime_end": visit_end, 

226 }, 

227 ) 

228 

229 # Add more visits for some later tests 

230 for visit_id in (424, 425): 

231 butler.registry.insertDimensionData( 

232 "visit", 

233 { 

234 "instrument": "DummyCamComp", 

235 "id": visit_id, 

236 "name": f"fourtwentyfour_{visit_id}", 

237 "physical_filter": "d-r", 

238 "visit_system": 1, 

239 }, 

240 ) 

241 return butler, datasetType 

242 

243 def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Butler: 

244 # New datasets will be added to a run collection, and we will only look 

245 # in that run when looking up datasets. 

246 run = self.default_run 

247 butler, datasetType = self.create_butler(run, storageClass, datasetTypeName) 

248 assert butler.run is not None 

249 

250 # Create and store a dataset 

251 metric = makeExampleMetrics() 

252 dataId = butler.registry.expandDataId({"instrument": "DummyCamComp", "visit": 423}) 

253 

254 # Put and remove the dataset once as a DatasetRef, once as a dataId, 

255 # and once with a DatasetType 

256 

257 # Keep track of any collections we add and do not clean up 

258 expected_collections = {run} 

259 

260 counter = 0 

261 ref = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run="put_run_1") 

262 args: tuple[DatasetRef] | tuple[str | DatasetType, DataCoordinate] 

263 for args in ((ref,), (datasetTypeName, dataId), (datasetType, dataId)): 

264 # Since we are using subTest we can get cascading failures 

265 # here with the first attempt failing and the others failing 

266 # immediately because the dataset already exists. Work around 

267 # this by using a distinct run collection each time 

268 counter += 1 

269 this_run = f"put_run_{counter}" 

270 butler.registry.registerCollection(this_run, type=CollectionType.RUN) 

271 expected_collections.update({this_run}) 

272 

273 with self.subTest(args=args): 

274 kwargs: dict[str, Any] = {} 

275 if not isinstance(args[0], DatasetRef): # type: ignore 

276 kwargs["run"] = this_run 

277 ref = butler.put(metric, *args, **kwargs) 

278 self.assertIsInstance(ref, DatasetRef) 

279 

280 # Test get with a resolved DatasetRef (formerly getDirect). 

281 metricOut = butler.get(ref) 

282 self.assertEqual(metric, metricOut) 

283 # Test get 

284 metricOut = butler.get(ref.datasetType.name, dataId, collections=this_run) 

285 self.assertEqual(metric, metricOut) 

286 # Test get with a datasetRef 

287 metricOut = butler.get(ref) 

288 self.assertEqual(metric, metricOut) 

289 # Test getDeferred with dataId 

290 metricOut = butler.getDeferred(ref.datasetType.name, dataId, collections=this_run).get() 

291 self.assertEqual(metric, metricOut) 

292 # Test getDeferred with a ref 

293 metricOut = butler.getDeferred(ref).get() 

294 self.assertEqual(metric, metricOut) 

295 

296 # Check we can get components 

297 if storageClass.isComposite(): 

298 self.assertGetComponents( 

299 butler, ref, ("summary", "data", "output"), metric, collections=this_run 

300 ) 

301 

302 # Can the artifacts themselves be retrieved? 

303 if not butler.datastore.isEphemeral: 

304 root_uri = ResourcePath(self.root) 

305 

306 for preserve_path in (True, False): 

307 destination = root_uri.join(f"artifacts/{preserve_path}_{counter}/") 

308 # Use copy so that we can test that overwrite 

309 # protection works (using "auto" for File URIs would 

310 # use hard links and subsequent transfer would work 

311 # because it knows they are the same file). 

312 transferred = butler.retrieveArtifacts( 

313 [ref], destination, preserve_path=preserve_path, transfer="copy" 

314 ) 

315 self.assertGreater(len(transferred), 0) 

316 artifacts = list(ResourcePath.findFileResources([destination])) 

317 self.assertEqual(set(transferred), set(artifacts)) 

318 

319 for artifact in transferred: 

320 path_in_destination = artifact.relative_to(destination) 

321 self.assertIsNotNone(path_in_destination) 

322 assert path_in_destination is not None 

323 

324 # When the path is not preserved there should not be 

325 # any path separators. 

326 num_seps = path_in_destination.count("/") 

327 if preserve_path: 

328 self.assertGreater(num_seps, 0) 

329 else: 

330 self.assertEqual(num_seps, 0) 

331 

332 primary_uri, secondary_uris = butler.datastore.getURIs(ref) 

333 n_uris = len(secondary_uris) 

334 if primary_uri: 

335 n_uris += 1 

336 self.assertEqual( 

337 len(artifacts), 

338 n_uris, 

339 "Comparing expected artifacts vs actual:" 

340 f" {artifacts} vs {primary_uri} and {secondary_uris}", 

341 ) 

342 

343 if preserve_path: 

344 # No need to run these twice 

345 with self.assertRaises(ValueError): 

346 butler.retrieveArtifacts([ref], destination, transfer="move") 

347 

348 with self.assertRaises(FileExistsError): 

349 butler.retrieveArtifacts([ref], destination) 

350 

351 transferred_again = butler.retrieveArtifacts( 

352 [ref], destination, preserve_path=preserve_path, overwrite=True 

353 ) 

354 self.assertEqual(set(transferred_again), set(transferred)) 

355 

356 # Now remove the dataset completely. 

357 butler.pruneDatasets([ref], purge=True, unstore=True) 

358 # Lookup with original args should still fail. 

359 self.assertFalse(butler.exists(*args, collections=this_run)) 

360 # get() should still fail. 

361 with self.assertRaises(FileNotFoundError): 

362 butler.get(ref) 

363 # Registry shouldn't be able to find it by dataset_id anymore. 

364 self.assertIsNone(butler.registry.getDataset(ref.id)) 

365 

366 # Do explicit registry removal since we know the collection is 

367 # empty. 

368 butler.registry.removeCollection(this_run) 

369 expected_collections.remove(this_run) 

370 

371 # Create DatasetRef for put using default run. 

372 refIn = DatasetRef(datasetType, dataId, id=uuid.UUID(int=1), run=butler.run) 

373 

374 # Put the dataset again, since the last thing we did was remove it 

375 # and we want to use the default collection. 

376 ref = butler.put(metric, refIn) 

377 

378 # Get with parameters 

379 stop = 4 

380 sliced = butler.get(ref, parameters={"slice": slice(stop)}) 

381 self.assertNotEqual(metric, sliced) 

382 self.assertEqual(metric.summary, sliced.summary) 

383 self.assertEqual(metric.output, sliced.output) 

384 self.assertEqual(metric.data[:stop], sliced.data) 

385 # getDeferred with parameters 

386 sliced = butler.getDeferred(ref, parameters={"slice": slice(stop)}).get() 

387 self.assertNotEqual(metric, sliced) 

388 self.assertEqual(metric.summary, sliced.summary) 

389 self.assertEqual(metric.output, sliced.output) 

390 self.assertEqual(metric.data[:stop], sliced.data) 

391 # getDeferred with deferred parameters 

392 sliced = butler.getDeferred(ref).get(parameters={"slice": slice(stop)}) 

393 self.assertNotEqual(metric, sliced) 

394 self.assertEqual(metric.summary, sliced.summary) 

395 self.assertEqual(metric.output, sliced.output) 

396 self.assertEqual(metric.data[:stop], sliced.data) 

397 

398 if storageClass.isComposite(): 

399 # Check that components can be retrieved 

400 metricOut = butler.get(ref.datasetType.name, dataId) 

401 compNameS = ref.datasetType.componentTypeName("summary") 

402 compNameD = ref.datasetType.componentTypeName("data") 

403 summary = butler.get(compNameS, dataId) 

404 self.assertEqual(summary, metric.summary) 

405 data = butler.get(compNameD, dataId) 

406 self.assertEqual(data, metric.data) 

407 

408 if "counter" in storageClass.derivedComponents: 

409 count = butler.get(ref.datasetType.componentTypeName("counter"), dataId) 

410 self.assertEqual(count, len(data)) 

411 

412 count = butler.get( 

413 ref.datasetType.componentTypeName("counter"), dataId, parameters={"slice": slice(stop)} 

414 ) 

415 self.assertEqual(count, stop) 

416 

417 compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections) 

418 assert compRef is not None 

419 summary = butler.get(compRef) 

420 self.assertEqual(summary, metric.summary) 

421 

422 # Create a Dataset type that has the same name but is inconsistent. 

423 inconsistentDatasetType = DatasetType( 

424 datasetTypeName, datasetType.dimensions, self.storageClassFactory.getStorageClass("Config") 

425 ) 

426 

427 # Getting with a dataset type that does not match registry fails 

428 with self.assertRaisesRegex(ValueError, "Supplied dataset type .* inconsistent with registry"): 

429 butler.get(inconsistentDatasetType, dataId) 

430 

431 # Combining a DatasetRef with a dataId should fail 

432 with self.assertRaisesRegex(ValueError, "DatasetRef given, cannot use dataId as well"): 

433 butler.get(ref, dataId) 

434 # Getting with an explicit ref should fail if the id doesn't match. 

435 with self.assertRaises(FileNotFoundError): 

436 butler.get(DatasetRef(ref.datasetType, ref.dataId, id=uuid.UUID(int=101), run=butler.run)) 

437 

438 # Getting a dataset with unknown parameters should fail 

439 with self.assertRaisesRegex(KeyError, "Parameter 'unsupported' not understood"): 

440 butler.get(ref, parameters={"unsupported": True}) 

441 

442 # Check that we have the expected collections. 

443 collections = set(butler.registry.queryCollections()) 

444 self.assertEqual(collections, expected_collections) 

445 

446 # Clean up to check that we can remove something that may have 

447 # already had a component removed 

448 butler.pruneDatasets([ref], unstore=True, purge=True) 

449 

450 # Add the same ref again, so we can check that duplicate put fails. 

451 ref = butler.put(metric, datasetType, dataId) 

452 

453 # Repeat put will fail. 

454 with self.assertRaisesRegex( 

455 ConflictingDefinitionError, "A database constraint failure was triggered" 

456 ): 

457 butler.put(metric, datasetType, dataId) 

458 

459 # Remove the datastore entry. 

460 butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False) 

461 

462 # Put will still fail 

463 with self.assertRaisesRegex( 

464 ConflictingDefinitionError, "A database constraint failure was triggered" 

465 ): 

466 butler.put(metric, datasetType, dataId) 

467 

468 # Repeat the same sequence with resolved ref. 

469 butler.pruneDatasets([ref], unstore=True, purge=True) 

470 ref = butler.put(metric, refIn) 

471 

472 # Repeat put will fail. 

473 with self.assertRaisesRegex(ConflictingDefinitionError, "Datastore already contains dataset"): 

474 butler.put(metric, refIn) 

475 

476 # Remove the datastore entry. 

477 butler.pruneDatasets([ref], unstore=True, purge=False, disassociate=False) 

478 

479 # In case of resolved ref this write will succeed. 

480 ref = butler.put(metric, refIn) 

481 

482 # Leave the dataset in place since some downstream tests require 

483 # something to be present 

484 

485 return butler 

486 

487 def testDeferredCollectionPassing(self) -> None: 

488 # Construct a butler with no run or collection, but make it writeable. 

489 butler = Butler(self.tmpConfigFile, writeable=True) 

490 # Create and register a DatasetType 

491 dimensions = butler.dimensions.extract(["instrument", "visit"]) 

492 datasetType = self.addDatasetType( 

493 "example", dimensions, self.storageClassFactory.getStorageClass("StructuredData"), butler.registry 

494 ) 

495 # Add needed Dimensions 

496 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

497 butler.registry.insertDimensionData( 

498 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

499 ) 

500 butler.registry.insertDimensionData( 

501 "visit", 

502 {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"}, 

503 ) 

504 dataId = {"instrument": "DummyCamComp", "visit": 423} 

505 # Create dataset. 

506 metric = makeExampleMetrics() 

507 # Register a new run and put dataset. 

508 run = "deferred" 

509 self.assertTrue(butler.registry.registerRun(run)) 

510 # Second time it will be allowed but indicate no-op 

511 self.assertFalse(butler.registry.registerRun(run)) 

512 ref = butler.put(metric, datasetType, dataId, run=run) 

513 # Putting with no run should fail with CollectionError. 

514 with self.assertRaises(CollectionError): 

515 butler.put(metric, datasetType, dataId) 

516 # Dataset should exist. 

517 self.assertTrue(butler.exists(datasetType, dataId, collections=[run])) 

518 # We should be able to get the dataset back, but with and without 

519 # a deferred dataset handle. 

520 self.assertEqual(metric, butler.get(datasetType, dataId, collections=[run])) 

521 self.assertEqual(metric, butler.getDeferred(datasetType, dataId, collections=[run]).get()) 

522 # Without any collection the dataset cannot be found and get() raises CollectionError. 

523 self.assertFalse(butler.exists(datasetType, dataId)) 

524 with self.assertRaises(CollectionError): 

525 butler.get(datasetType, dataId) 

526 # Associate the dataset with a different collection. 

527 butler.registry.registerCollection("tagged") 

528 butler.registry.associate("tagged", [ref]) 

529 # Removing the dataset from the new collection should leave it findable 

530 # in the original collection. 

531 butler.pruneDatasets([ref], tags=["tagged"]) 

532 self.assertTrue(butler.exists(datasetType, dataId, collections=[run])) 

533 

534 

535class ButlerTests(ButlerPutGetTests): 

536 """Tests for Butler.""" 

537 

538 useTempRoot = True 

539 validationCanFail: bool 

540 fullConfigKey: str | None 

541 registryStr: str | None 

542 datastoreName: list[str] | None 

543 datastoreStr: list[str] 

544 

545 def setUp(self) -> None: 

546 """Create a new butler root for each test.""" 

547 self.root = makeTestTempDir(TESTDIR) 

548 Butler.makeRepo(self.root, config=Config(self.configFile)) 

549 self.tmpConfigFile = os.path.join(self.root, "butler.yaml") 

550 

551 def testConstructor(self) -> None: 

552 """Independent test of constructor.""" 

553 butler = Butler(self.tmpConfigFile, run=self.default_run) 

554 self.assertIsInstance(butler, Butler) 

555 

556 # Check that butler.yaml is added automatically. 

557 if self.tmpConfigFile.endswith(end := "/butler.yaml"): 

558 config_dir = self.tmpConfigFile[: -len(end)] 

559 butler = Butler(config_dir, run=self.default_run) 

560 self.assertIsInstance(butler, Butler) 

561 

562 # Even with a ResourcePath. 

563 butler = Butler(ResourcePath(config_dir, forceDirectory=True), run=self.default_run) 

564 self.assertIsInstance(butler, Butler) 

565 

566 collections = set(butler.registry.queryCollections()) 

567 self.assertEqual(collections, {self.default_run}) 

568 

569 # Check that some special characters can be included in run name. 

570 special_run = "u@b.c-A" 

571 butler_special = Butler(butler=butler, run=special_run) 

572 collections = set(butler_special.registry.queryCollections("*@*")) 

573 self.assertEqual(collections, {special_run}) 

574 

575 butler2 = Butler(butler=butler, collections=["other"]) 

576 self.assertEqual(butler2.collections, ("other",)) 

577 self.assertIsNone(butler2.run) 

578 self.assertIs(butler.datastore, butler2.datastore) 

579 

580 # Test that we can use an environment variable to find this 

581 # repository. 

582 butler_index = Config() 

583 butler_index["label"] = self.tmpConfigFile 

584 for suffix in (".yaml", ".json"): 

585 # Ensure that the content differs so that we know that 

586 # we aren't reusing the cache. 

587 bad_label = f"file://bucket/not_real{suffix}" 

588 butler_index["bad_label"] = bad_label 

589 with ResourcePath.temporary_uri(suffix=suffix) as temp_file: 

590 butler_index.dumpToUri(temp_file) 

591 with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}): 

592 self.assertEqual(Butler.get_known_repos(), {"label", "bad_label"}) 

593 uri = Butler.get_repo_uri("bad_label") 

594 self.assertEqual(uri, ResourcePath(bad_label)) 

595 uri = Butler.get_repo_uri("label") 

596 butler = Butler(uri, writeable=False) 

597 self.assertIsInstance(butler, Butler) 

598 butler = Butler("label", writeable=False) 

599 self.assertIsInstance(butler, Butler) 

600 with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"): 

601 Butler("not_there", writeable=False) 

602 with self.assertRaisesRegex(FileNotFoundError, "resolved from alias 'bad_label'"): 

603 Butler("bad_label") 

604 with self.assertRaises(FileNotFoundError): 

605 # Should ignore aliases. 

606 Butler(ResourcePath("label", forceAbsolute=False)) 

607 with self.assertRaises(KeyError) as cm: 

608 Butler.get_repo_uri("missing") 

609 self.assertEqual( 

610 Butler.get_repo_uri("missing", True), ResourcePath("missing", forceAbsolute=False) 

611 ) 

612 self.assertIn("not known to", str(cm.exception)) 

613 # Should report no failure. 

614 self.assertEqual(ButlerRepoIndex.get_failure_reason(), "") 

615 with ResourcePath.temporary_uri(suffix=suffix) as temp_file: 

616 # Now with empty configuration. 

617 butler_index = Config() 

618 butler_index.dumpToUri(temp_file) 

619 with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}): 

620 with self.assertRaisesRegex(FileNotFoundError, "(no known aliases)"): 

621 Butler("label") 

622 with ResourcePath.temporary_uri(suffix=suffix) as temp_file: 

623 # Now with bad contents. 

624 with open(temp_file.ospath, "w") as fh: 

625 print("'", file=fh) 

626 with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}): 

627 with self.assertRaisesRegex(FileNotFoundError, "(no known aliases:.*could not be read)"): 

628 Butler("label") 

629 with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}): 

630 with self.assertRaises(FileNotFoundError): 

631 Butler.get_repo_uri("label") 

632 self.assertEqual(Butler.get_known_repos(), set()) 

633 

634 with self.assertRaisesRegex(FileNotFoundError, "index file not found"): 

635 Butler("label") 

636 

637 # Check that we can create Butler when the alias file is not found. 

638 butler = Butler(self.tmpConfigFile, writeable=False) 

639 self.assertIsInstance(butler, Butler) 

640 with self.assertRaises(KeyError) as cm: 

641 # No environment variable set. 

642 Butler.get_repo_uri("label") 

643 self.assertEqual(Butler.get_repo_uri("label", True), ResourcePath("label", forceAbsolute=False)) 

644 self.assertIn("No repository index defined", str(cm.exception)) 

645 with self.assertRaisesRegex(FileNotFoundError, "no known aliases.*No repository index"): 

646 # No aliases registered. 

647 Butler("not_there") 

648 self.assertEqual(Butler.get_known_repos(), set()) 

649 
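# For reference, the repository index constructed in testConstructor above
# would look roughly like this on disk (illustrative content only; the real
# paths come from self.tmpConfigFile and the generated bad_label value):
#
#     label: /path/to/test/repo/butler.yaml
#     bad_label: file://bucket/not_real.yaml
#
# and is discovered through the DAF_BUTLER_REPOSITORY_INDEX environment
# variable.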

650 def testBasicPutGet(self) -> None: 

651 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

652 self.runPutGetTest(storageClass, "test_metric") 

653 

654 def testCompositePutGetConcrete(self) -> None: 

655 storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadCompNoDisassembly") 

656 butler = self.runPutGetTest(storageClass, "test_metric") 

657 

658 # Should *not* be disassembled 

659 datasets = list(butler.registry.queryDatasets(..., collections=self.default_run)) 

660 self.assertEqual(len(datasets), 1) 

661 uri, components = butler.getURIs(datasets[0]) 

662 self.assertIsInstance(uri, ResourcePath) 

663 self.assertFalse(components) 

664 self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}") 

665 self.assertIn("423", str(uri), f"Checking visit is in URI {uri}") 

666 

667 # Predicted dataset 

668 dataId = {"instrument": "DummyCamComp", "visit": 424} 

669 uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True) 

670 self.assertFalse(components) 

671 self.assertIsInstance(uri, ResourcePath) 

672 self.assertIn("424", str(uri), f"Checking visit is in URI {uri}") 

673 self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}") 

674 

675 def testCompositePutGetVirtual(self) -> None: 

676 storageClass = self.storageClassFactory.getStorageClass("StructuredCompositeReadComp") 

677 butler = self.runPutGetTest(storageClass, "test_metric_comp") 

678 

679 # Should be disassembled 

680 datasets = list(butler.registry.queryDatasets(..., collections=self.default_run)) 

681 self.assertEqual(len(datasets), 1) 

682 uri, components = butler.getURIs(datasets[0]) 

683 

684 if butler.datastore.isEphemeral: 

685 # Never disassemble in-memory datastore 

686 self.assertIsInstance(uri, ResourcePath) 

687 self.assertFalse(components) 

688 self.assertEqual(uri.fragment, "", f"Checking absence of fragment in {uri}") 

689 self.assertIn("423", str(uri), f"Checking visit is in URI {uri}") 

690 else: 

691 self.assertIsNone(uri) 

692 self.assertEqual(set(components), set(storageClass.components)) 

693 for compuri in components.values(): 

694 self.assertIsInstance(compuri, ResourcePath) 

695 self.assertIn("423", str(compuri), f"Checking visit is in URI {compuri}") 

696 self.assertEqual(compuri.fragment, "", f"Checking absence of fragment in {compuri}") 

697 

698 # Predicted dataset 

699 dataId = {"instrument": "DummyCamComp", "visit": 424} 

700 uri, components = butler.getURIs(datasets[0].datasetType, dataId=dataId, predict=True) 

701 

702 if butler.datastore.isEphemeral: 

703 # Never disassembled 

704 self.assertIsInstance(uri, ResourcePath) 

705 self.assertFalse(components) 

706 self.assertIn("424", str(uri), f"Checking visit is in URI {uri}") 

707 self.assertEqual(uri.fragment, "predicted", f"Checking for fragment in {uri}") 

708 else: 

709 self.assertIsNone(uri) 

710 self.assertEqual(set(components), set(storageClass.components)) 

711 for compuri in components.values(): 

712 self.assertIsInstance(compuri, ResourcePath) 

713 self.assertIn("424", str(compuri), f"Checking visit is in URI {compuri}") 

714 self.assertEqual(compuri.fragment, "predicted", f"Checking for fragment in {compuri}") 

715 

716 def testStorageClassOverrideGet(self) -> None: 

717 """Test storage class conversion on get with override.""" 

718 storageClass = self.storageClassFactory.getStorageClass("StructuredData") 

719 datasetTypeName = "anything" 

720 run = self.default_run 

721 

722 butler, datasetType = self.create_butler(run, storageClass, datasetTypeName) 

723 

724 # Create and store a dataset. 

725 metric = makeExampleMetrics() 

726 dataId = {"instrument": "DummyCamComp", "visit": 423} 

727 

728 ref = butler.put(metric, datasetType, dataId) 

729 

730 # Return native type. 

731 retrieved = butler.get(ref) 

732 self.assertEqual(retrieved, metric) 

733 

734 # Specify an override. 

735 new_sc = self.storageClassFactory.getStorageClass("MetricsConversion") 

736 model = butler.get(ref, storageClass=new_sc) 

737 self.assertNotEqual(type(model), type(retrieved)) 

738 self.assertIs(type(model), new_sc.pytype) 

739 self.assertEqual(retrieved, model) 

740 

741 # Defer but override later. 

742 deferred = butler.getDeferred(ref) 

743 model = deferred.get(storageClass=new_sc) 

744 self.assertIs(type(model), new_sc.pytype) 

745 self.assertEqual(retrieved, model) 

746 

747 # Defer but override up front. 

748 deferred = butler.getDeferred(ref, storageClass=new_sc) 

749 model = deferred.get() 

750 self.assertIs(type(model), new_sc.pytype) 

751 self.assertEqual(retrieved, model) 

752 

753 # Retrieve a component. Should be a tuple. 

754 data = butler.get("anything.data", dataId, storageClass="StructuredDataDataTestTuple") 

755 self.assertIs(type(data), tuple) 

756 self.assertEqual(data, tuple(retrieved.data)) 

757 

758 # Parameter on the write storage class should work regardless 

759 # of read storage class. 

760 data = butler.get( 

761 "anything.data", 

762 dataId, 

763 storageClass="StructuredDataDataTestTuple", 

764 parameters={"slice": slice(2, 4)}, 

765 ) 

766 self.assertEqual(len(data), 2) 

767 

768 # Try a parameter that is known to the read storage class but not 

769 # the write storage class. 

770 with self.assertRaises(KeyError): 

771 butler.get( 

772 "anything.data", 

773 dataId, 

774 storageClass="StructuredDataDataTestTuple", 

775 parameters={"xslice": slice(2, 4)}, 

776 ) 

777 

778 def testPytypePutCoercion(self) -> None: 

779 """Test python type coercion on Butler.get and put.""" 

780 

781 # Store some data with the normal example storage class. 

782 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

783 datasetTypeName = "test_metric" 

784 butler, _ = self.create_butler(self.default_run, storageClass, datasetTypeName) 

785 

786 dataId = {"instrument": "DummyCamComp", "visit": 423} 

787 

788 # Put a dict and this should coerce to a MetricsExample 

789 test_dict = {"summary": {"a": 1}, "output": {"b": 2}} 

790 metric_ref = butler.put(test_dict, datasetTypeName, dataId=dataId, visit=424) 

791 test_metric = butler.get(metric_ref) 

792 self.assertEqual(get_full_type_name(test_metric), "lsst.daf.butler.tests.MetricsExample") 

793 self.assertEqual(test_metric.summary, test_dict["summary"]) 

794 self.assertEqual(test_metric.output, test_dict["output"]) 

795 

796 # Check that the put still works if a DatasetType is given with 

797 # a definition matching this python type. 

798 registry_type = butler.registry.getDatasetType(datasetTypeName) 

799 this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson") 

800 metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425) 

801 self.assertEqual(metric2_ref.datasetType, registry_type) 

802 

803 # The get will return the type expected by registry. 

804 test_metric2 = butler.get(metric2_ref) 

805 self.assertEqual(get_full_type_name(test_metric2), "lsst.daf.butler.tests.MetricsExample") 

806 

807 # Make a new DatasetRef with the compatible but different DatasetType. 

808 # This should now return a dict. 

809 new_ref = DatasetRef(this_type, metric2_ref.dataId, id=metric2_ref.id, run=metric2_ref.run) 

810 test_dict2 = butler.get(new_ref) 

811 self.assertEqual(get_full_type_name(test_dict2), "dict") 

812 

813 # Get it again via the compatible but different dataset type definition 

814 # and a dataId rather than a resolved ref. This should be consistent 

815 # with ref-based get() and return the python type of that DatasetType. 

816 test_dict3 = butler.get(this_type, dataId=dataId, visit=425) 

817 self.assertEqual(get_full_type_name(test_dict3), "dict") 

818 

819 def testIngest(self) -> None: 

820 butler = Butler(self.tmpConfigFile, run=self.default_run) 

821 

822 # Create and register a DatasetType 

823 dimensions = butler.dimensions.extract(["instrument", "visit", "detector"]) 

824 

825 storageClass = self.storageClassFactory.getStorageClass("StructuredDataDictYaml") 

826 datasetTypeName = "metric" 

827 

828 datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry) 

829 

830 # Add needed Dimensions 

831 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

832 butler.registry.insertDimensionData( 

833 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

834 ) 

835 for detector in (1, 2): 

836 butler.registry.insertDimensionData( 

837 "detector", {"instrument": "DummyCamComp", "id": detector, "full_name": f"detector{detector}"} 

838 ) 

839 

840 butler.registry.insertDimensionData( 

841 "visit", 

842 {"instrument": "DummyCamComp", "id": 423, "name": "fourtwentythree", "physical_filter": "d-r"}, 

843 {"instrument": "DummyCamComp", "id": 424, "name": "fourtwentyfour", "physical_filter": "d-r"}, 

844 ) 

845 

846 formatter = doImportType("lsst.daf.butler.formatters.yaml.YamlFormatter") 

847 dataRoot = os.path.join(TESTDIR, "data", "basic") 

848 datasets = [] 

849 for detector in (1, 2): 

850 detector_name = f"detector_{detector}" 

851 metricFile = os.path.join(dataRoot, f"{detector_name}.yaml") 

852 dataId = butler.registry.expandDataId( 

853 {"instrument": "DummyCamComp", "visit": 423, "detector": detector} 

854 ) 

855 # Create a DatasetRef for ingest 

856 refIn = DatasetRef(datasetType, dataId, run=self.default_run) 

857 

858 datasets.append(FileDataset(path=metricFile, refs=[refIn], formatter=formatter)) 

859 

860 butler.ingest(*datasets, transfer="copy") 

861 

862 dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423} 

863 dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423} 

864 

865 metrics1 = butler.get(datasetTypeName, dataId1) 

866 metrics2 = butler.get(datasetTypeName, dataId2) 

867 self.assertNotEqual(metrics1, metrics2) 

868 

869 # Compare URIs 

870 uri1 = butler.getURI(datasetTypeName, dataId1) 

871 uri2 = butler.getURI(datasetTypeName, dataId2) 

872 self.assertNotEqual(uri1, uri2) 

873 

874 # Now do a multi-dataset but single file ingest 

875 metricFile = os.path.join(dataRoot, "detectors.yaml") 

876 refs = [] 

877 for detector in (1, 2): 

878 detector_name = f"detector_{detector}" 

879 dataId = butler.registry.expandDataId( 

880 {"instrument": "DummyCamComp", "visit": 424, "detector": detector} 

881 ) 

882 # Create a DatasetRef for ingest 

883 refs.append(DatasetRef(datasetType, dataId, run=self.default_run)) 

884 

885 # Test "move" transfer to ensure that the files themselves 

886 # have disappeared following ingest. 

887 with ResourcePath.temporary_uri(suffix=".yaml") as tempFile: 

888 tempFile.transfer_from(ResourcePath(metricFile), transfer="copy") 

889 

890 datasets = [] 

891 datasets.append(FileDataset(path=tempFile, refs=refs, formatter=MultiDetectorFormatter)) 

892 

893 # For first ingest use copy. 

894 butler.ingest(*datasets, transfer="copy", record_validation_info=False) 

895 

896 # Now try to ingest again in "execution butler" mode where 

897 # the registry entries exist but the datastore does not have 

898 # the files. We also need to strip the dimension records to ensure 

899 # that they will be re-added by the ingest. 

900 ref = datasets[0].refs[0] 

901 datasets[0].refs = [ 

902 cast( 

903 DatasetRef, 

904 butler.registry.findDataset(ref.datasetType, dataId=ref.dataId, collections=ref.run), 

905 ) 

906 for ref in datasets[0].refs 

907 ] 

908 all_refs = [] 

909 for dataset in datasets: 

910 refs = [] 

911 for ref in dataset.refs: 

912 # Create a dict from the dataId to drop the records. 

913 new_data_id = {str(k): v for k, v in ref.dataId.items()} 

914 new_ref = butler.registry.findDataset(ref.datasetType, new_data_id, collections=ref.run) 

915 assert new_ref is not None 

916 self.assertFalse(new_ref.dataId.hasRecords()) 

917 refs.append(new_ref) 

918 dataset.refs = refs 

919 all_refs.extend(dataset.refs) 

920 butler.pruneDatasets(all_refs, disassociate=False, unstore=True, purge=False) 

921 

922 # Use move mode to test that the file is deleted. Also 

923 # disable recording of file size. 

924 butler.ingest(*datasets, transfer="move", record_validation_info=False) 

925 

926 # Check that every ref now has records. 

927 for dataset in datasets: 

928 for ref in dataset.refs: 

929 self.assertTrue(ref.dataId.hasRecords()) 

930 

931 # Ensure that the file has disappeared. 

932 self.assertFalse(tempFile.exists()) 

933 

934 # Check that the datastore recorded no file size. 

935 # Not all datastores can support this. 

936 try: 

937 infos = butler.datastore.getStoredItemsInfo(datasets[0].refs[0]) # type: ignore[attr-defined] 

938 self.assertEqual(infos[0].file_size, -1) 

939 except AttributeError: 

940 pass 

941 

942 dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424} 

943 dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424} 

944 

945 multi1 = butler.get(datasetTypeName, dataId1) 

946 multi2 = butler.get(datasetTypeName, dataId2) 

947 

948 self.assertEqual(multi1, metrics1) 

949 self.assertEqual(multi2, metrics2) 

950 

951 # Compare URIs 

952 uri1 = butler.getURI(datasetTypeName, dataId1) 

953 uri2 = butler.getURI(datasetTypeName, dataId2) 

954 self.assertEqual(uri1, uri2, f"Cf. {uri1} with {uri2}") 

955 

956 # Test that removing one does not break the second 

957 # This line will issue a warning log message for a ChainedDatastore 

958 # that uses an InMemoryDatastore since in-memory cannot ingest 

959 # files. 

960 butler.pruneDatasets([datasets[0].refs[0]], unstore=True, disassociate=False) 

961 self.assertFalse(butler.exists(datasetTypeName, dataId1)) 

962 self.assertTrue(butler.exists(datasetTypeName, dataId2)) 

963 multi2b = butler.get(datasetTypeName, dataId2) 

964 self.assertEqual(multi2, multi2b) 

965 

966 # Ensure we can ingest 0 datasets 

967 datasets = [] 

968 butler.ingest(*datasets) 

969 

970 def testPickle(self) -> None: 

971 """Test pickle support.""" 

972 butler = Butler(self.tmpConfigFile, run=self.default_run) 

973 butlerOut = pickle.loads(pickle.dumps(butler)) 

974 self.assertIsInstance(butlerOut, Butler) 

975 self.assertEqual(butlerOut._config, butler._config) 

976 self.assertEqual(butlerOut.collections, butler.collections) 

977 self.assertEqual(butlerOut.run, butler.run) 

978 

979 def testGetDatasetTypes(self) -> None: 

980 butler = Butler(self.tmpConfigFile, run=self.default_run) 

981 dimensions = butler.dimensions.extract(["instrument", "visit", "physical_filter"]) 

982 dimensionEntries: list[tuple[str, list[Mapping[str, Any]]]] = [ 

983 ( 

984 "instrument", 

985 [ 

986 {"instrument": "DummyCam"}, 

987 {"instrument": "DummyHSC"}, 

988 {"instrument": "DummyCamComp"}, 

989 ], 

990 ), 

991 ("physical_filter", [{"instrument": "DummyCam", "name": "d-r", "band": "R"}]), 

992 ("visit", [{"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}]), 

993 ] 

994 storageClass = self.storageClassFactory.getStorageClass("StructuredData") 

995 # Add needed Dimensions 

996 for element, data in dimensionEntries: 

997 butler.registry.insertDimensionData(element, *data) 

998 

999 # When a DatasetType is added to the registry entries are not created 

1000 # for components but querying them can return the components. 

1001 datasetTypeNames = {"metric", "metric2", "metric4", "metric33", "pvi", "paramtest"} 

1002 components = set() 

1003 for datasetTypeName in datasetTypeNames: 

1004 # Create and register a DatasetType 

1005 self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry) 

1006 

1007 for componentName in storageClass.components: 

1008 components.add(DatasetType.nameWithComponent(datasetTypeName, componentName)) 

1009 

1010 fromRegistry: set[DatasetType] = set() 

1011 for parent_dataset_type in butler.registry.queryDatasetTypes(): 

1012 fromRegistry.add(parent_dataset_type) 

1013 fromRegistry.update(parent_dataset_type.makeAllComponentDatasetTypes()) 

1014 self.assertEqual({d.name for d in fromRegistry}, datasetTypeNames | components) 

1015 

1016 # Now that we have some dataset types registered, validate them 

1017 butler.validateConfiguration( 

1018 ignore=[ 

1019 "test_metric_comp", 

1020 "metric3", 

1021 "metric5", 

1022 "calexp", 

1023 "DummySC", 

1024 "datasetType.component", 

1025 "random_data", 

1026 "random_data_2", 

1027 ] 

1028 ) 

1029 

1030 # Add a new datasetType that will fail template validation 

1031 self.addDatasetType("test_metric_comp", dimensions, storageClass, butler.registry) 

1032 if self.validationCanFail: 

1033 with self.assertRaises(ValidationError): 

1034 butler.validateConfiguration() 

1035 

1036 # Rerun validation but with a subset of dataset type names 

1037 butler.validateConfiguration(datasetTypeNames=["metric4"]) 

1038 

1039 # Rerun validation but ignore the bad datasetType 

1040 butler.validateConfiguration( 

1041 ignore=[ 

1042 "test_metric_comp", 

1043 "metric3", 

1044 "metric5", 

1045 "calexp", 

1046 "DummySC", 

1047 "datasetType.component", 

1048 "random_data", 

1049 "random_data_2", 

1050 ] 

1051 ) 

1052 

1053 def testTransaction(self) -> None: 

1054 butler = Butler(self.tmpConfigFile, run=self.default_run) 

1055 datasetTypeName = "test_metric" 

1056 dimensions = butler.dimensions.extract(["instrument", "visit"]) 

1057 dimensionEntries: tuple[tuple[str, Mapping[str, Any]], ...] = ( 

1058 ("instrument", {"instrument": "DummyCam"}), 

1059 ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}), 

1060 ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}), 

1061 ) 

1062 storageClass = self.storageClassFactory.getStorageClass("StructuredData") 

1063 metric = makeExampleMetrics() 

1064 dataId = {"instrument": "DummyCam", "visit": 42} 

1065 # Create and register a DatasetType 

1066 datasetType = self.addDatasetType(datasetTypeName, dimensions, storageClass, butler.registry) 

1067 with self.assertRaises(TransactionTestError): 

1068 with butler.transaction(): 

1069 # Add needed Dimensions 

1070 for args in dimensionEntries: 

1071 butler.registry.insertDimensionData(*args) 

1072 # Store a dataset 

1073 ref = butler.put(metric, datasetTypeName, dataId) 

1074 self.assertIsInstance(ref, DatasetRef) 

1075 # Test get with a resolved DatasetRef (formerly getDirect). 

1076 metricOut = butler.get(ref) 

1077 self.assertEqual(metric, metricOut) 

1078 # Test get 

1079 metricOut = butler.get(datasetTypeName, dataId) 

1080 self.assertEqual(metric, metricOut) 

1081 # Check we can get components 

1082 self.assertGetComponents(butler, ref, ("summary", "data", "output"), metric) 

1083 raise TransactionTestError("This should roll back the entire transaction") 

1084 with self.assertRaises(DataIdValueError, msg=f"Check can't expand DataId {dataId}"): 

1085 butler.registry.expandDataId(dataId) 

1086 # Should raise LookupError for missing data ID value 

1087 with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"): 

1088 butler.get(datasetTypeName, dataId) 

1089 # Also check explicitly if Dataset entry is missing 

1090 self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections)) 

1091 # Direct retrieval should not find the file in the Datastore 

1092 with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"): 

1093 butler.get(ref) 

1094 

1095 def testMakeRepo(self) -> None: 

1096 """Test that we can write butler configuration to a new repository via 

1097 the Butler.makeRepo interface and then instantiate a butler from the 

1098 repo root. 

1099 """ 

1100 # Do not run the test if we know this datastore configuration does 

1101 # not support a file system root 

1102 if self.fullConfigKey is None: 

1103 return 

1104 

1105 # create two separate directories 

1106 root1 = tempfile.mkdtemp(dir=self.root) 

1107 root2 = tempfile.mkdtemp(dir=self.root) 

1108 

1109 butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile)) 

1110 limited = Config(self.configFile) 

1111 butler1 = Butler(butlerConfig) 

1112 butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile)) 

1113 full = Config(self.tmpConfigFile) 

1114 butler2 = Butler(butlerConfig) 

1115 # Butlers should have the same configuration regardless of whether 

1116 # defaults were expanded. 

1117 self.assertEqual(butler1._config, butler2._config) 

1118 # Config files loaded directly should not be the same. 

1119 self.assertNotEqual(limited, full) 

1120 # Make sure "limited" doesn't have a few keys we know it should be 

1121 # inheriting from defaults. 

1122 self.assertIn(self.fullConfigKey, full) 

1123 self.assertNotIn(self.fullConfigKey, limited) 

1124 

1125 # Collections don't appear until something is put in them 

1126 collections1 = set(butler1.registry.queryCollections()) 

1127 self.assertEqual(collections1, set()) 

1128 self.assertEqual(set(butler2.registry.queryCollections()), collections1) 

1129 

1130 # Check that a config with no associated file name will not 

1131 # work properly with a relocatable Butler repo. 

1132 butlerConfig.configFile = None 

1133 with self.assertRaises(ValueError): 

1134 Butler(butlerConfig) 

1135 

1136 with self.assertRaises(FileExistsError): 

1137 Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False) 

1138 

1139 def testStringification(self) -> None: 

1140 butler = Butler(self.tmpConfigFile, run=self.default_run) 

1141 butlerStr = str(butler) 

1142 

1143 if self.datastoreStr is not None: 

1144 for testStr in self.datastoreStr: 

1145 self.assertIn(testStr, butlerStr) 

1146 if self.registryStr is not None: 

1147 self.assertIn(self.registryStr, butlerStr) 

1148 

1149 datastoreName = butler.datastore.name 

1150 if self.datastoreName is not None: 

1151 for testStr in self.datastoreName: 

1152 self.assertIn(testStr, datastoreName) 

1153 

1154 def testButlerRewriteDataId(self) -> None: 

1155 """Test that dataIds can be rewritten based on dimension records.""" 

1156 

1157 butler = Butler(self.tmpConfigFile, run=self.default_run) 

1158 

1159 storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict") 

1160 datasetTypeName = "random_data" 

1161 

1162 # Create dimension records. 

1163 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

1164 butler.registry.insertDimensionData( 

1165 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

1166 ) 

1167 butler.registry.insertDimensionData( 

1168 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"} 

1169 ) 

1170 

1171 dimensions = butler.dimensions.extract(["instrument", "exposure"]) 

1172 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

1173 butler.registry.registerDatasetType(datasetType) 

1174 

1175 n_exposures = 5 

1176 dayobs = 20210530 

1177 

1178 for i in range(n_exposures): 

1179 butler.registry.insertDimensionData( 

1180 "exposure", 

1181 { 

1182 "instrument": "DummyCamComp", 

1183 "id": i, 

1184 "obs_id": f"exp{i}", 

1185 "seq_num": i, 

1186 "day_obs": dayobs, 

1187 "physical_filter": "d-r", 

1188 }, 

1189 ) 

1190 

1191 # Write some data. 

1192 for i in range(n_exposures): 

1193 metric = {"something": i, "other": "metric", "list": [2 * x for x in range(i)]} 

1194 

1195 # Use the seq_num for the put to test rewriting. 

1196 dataId = {"seq_num": i, "day_obs": dayobs, "instrument": "DummyCamComp", "physical_filter": "d-r"} 

1197 ref = butler.put(metric, datasetTypeName, dataId=dataId) 

1198 

1199 # Check that the exposure is correct in the dataId 

1200 self.assertEqual(ref.dataId["exposure"], i) 

1201 

1202 # and check that we can get the dataset back with the same dataId 

1203 new_metric = butler.get(datasetTypeName, dataId=dataId) 

1204 self.assertEqual(new_metric, metric) 

1205 

1206 

1207class FileDatastoreButlerTests(ButlerTests): 

1208 """Common tests and specialization of ButlerTests for butlers backed 

1209 by datastores that inherit from FileDatastore. 

1210 """ 

1211 

1212 def checkFileExists(self, root: str | ResourcePath, relpath: str | ResourcePath) -> bool: 

1213 """Checks if file exists at a given path (relative to root). 

1214 

1215 The testPutTemplates test verifies the actual physical existence of 

1216 the files in the requested location. 

1217 """ 

1218 uri = ResourcePath(root, forceDirectory=True) 

1219 return uri.join(relpath).exists() 

1220 

1221 def testPutTemplates(self) -> None: 

1222 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1223 butler = Butler(self.tmpConfigFile, run=self.default_run) 

1224 

1225 # Add needed Dimensions 

1226 butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

1227 butler.registry.insertDimensionData( 

1228 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

1229 ) 

1230 butler.registry.insertDimensionData( 

1231 "visit", {"instrument": "DummyCamComp", "id": 423, "name": "v423", "physical_filter": "d-r"} 

1232 ) 

1233 butler.registry.insertDimensionData( 

1234 "visit", {"instrument": "DummyCamComp", "id": 425, "name": "v425", "physical_filter": "d-r"} 

1235 ) 

1236 

1237 # Create and store a dataset 

1238 metric = makeExampleMetrics() 

1239 

1240 # Create two almost-identical DatasetTypes (both will use default 

1241 # template) 

1242 dimensions = butler.dimensions.extract(["instrument", "visit"]) 

1243 butler.registry.registerDatasetType(DatasetType("metric1", dimensions, storageClass)) 

1244 butler.registry.registerDatasetType(DatasetType("metric2", dimensions, storageClass)) 

1245 butler.registry.registerDatasetType(DatasetType("metric3", dimensions, storageClass)) 

1246 

1247 dataId1 = {"instrument": "DummyCamComp", "visit": 423} 

1248 dataId2 = {"instrument": "DummyCamComp", "visit": 423, "physical_filter": "d-r"} 

1249 

1250 # Put with exactly the data ID keys needed 

1251 ref = butler.put(metric, "metric1", dataId1) 

1252 uri = butler.getURI(ref) 

1253 self.assertTrue(uri.exists()) 

1254 self.assertTrue( 

1255 uri.unquoted_path.endswith(f"{self.default_run}/metric1/??#?/d-r/DummyCamComp_423.pickle") 

1256 ) 

1257 

1258 # Check the template based on dimensions 

1259 if hasattr(butler.datastore, "templates"): 

1260 butler.datastore.templates.validateTemplates([ref]) 

1261 

1262 # Put with extra data ID keys (physical_filter is an optional 

1263 # dependency); should not change template (at least the way we're 

1264 # defining them to behave now; the important thing is that they 

1265 # must be consistent). 

1266 ref = butler.put(metric, "metric2", dataId2) 

1267 uri = butler.getURI(ref) 

1268 self.assertTrue(uri.exists()) 

1269 self.assertTrue( 

1270 uri.unquoted_path.endswith(f"{self.default_run}/metric2/d-r/DummyCamComp_v423.pickle") 

1271 ) 

1272 

1273 # Check the template based on dimensions 

1274 if hasattr(butler.datastore, "templates"): 

1275 butler.datastore.templates.validateTemplates([ref]) 

1276 

1277 # Use a template that has a typo in dimension record metadata. 

1278 # Easier to test with a butler that has a ref with records attached. 

1279 template = FileTemplate("a/{visit.name}/{id}_{visit.namex:?}.fits") 

1280 with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"): 

1281 path = template.format(ref) 

1282 self.assertEqual(path, f"a/v423/{ref.id}_fits") 

1283 

1284 template = FileTemplate("a/{visit.name}/{id}_{visit.namex}.fits") 

1285 with self.assertRaises(KeyError): 

1286 with self.assertLogs("lsst.daf.butler.core.fileTemplates", "INFO"): 

1287 template.format(ref) 

1288 

1289 # Now use a file template that will not result in unique filenames 

1290 with self.assertRaises(FileTemplateValidationError): 

1291 butler.put(metric, "metric3", dataId1) 

1292 

1293 def testImportExport(self) -> None: 

1294 # Run put/get tests just to create and populate a repo. 

1295 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1296 self.runImportExportTest(storageClass) 

1297 

1298 @unittest.expectedFailure 

1299 def testImportExportVirtualComposite(self) -> None: 

1300 # Run put/get tests just to create and populate a repo. 

1301 storageClass = self.storageClassFactory.getStorageClass("StructuredComposite") 

1302 self.runImportExportTest(storageClass) 

1303 

1304 def runImportExportTest(self, storageClass: StorageClass) -> None: 

1305 """This test does an export to a temp directory and an import back 

1306 into a new temp directory repo. It does not assume a posix datastore""" 

1307 exportButler = self.runPutGetTest(storageClass, "test_metric") 

1308 

1309 # Test that we must have a file extension. 

1310 with self.assertRaises(ValueError): 

1311 with exportButler.export(filename="dump", directory=".") as export: 

1312 pass 

1313 

1314 # Test that unknown format is not allowed. 

1315 with self.assertRaises(ValueError): 

1316 with exportButler.export(filename="dump.fits", directory=".") as export: 

1317 pass 

1318 

1319 # Test that the repo actually has at least one dataset. 

1320 datasets = list(exportButler.registry.queryDatasets(..., collections=...)) 

1321 self.assertGreater(len(datasets), 0) 

1322 # Add a DimensionRecord that's unused by those datasets. 

1323 skymapRecord = {"name": "example_skymap", "hash": (50).to_bytes(8, byteorder="little")} 

1324 exportButler.registry.insertDimensionData("skymap", skymapRecord) 

1325 # Export and then import datasets. 

1326 with safeTestTempDir(TESTDIR) as exportDir: 

1327 exportFile = os.path.join(exportDir, "exports.yaml") 

1328 with exportButler.export(filename=exportFile, directory=exportDir, transfer="auto") as export: 

1329 export.saveDatasets(datasets) 

1330 # Export the same datasets again. This should quietly do 

1331 # nothing because of internal deduplication, and it shouldn't 

1332 # complain about being asked to export the "htm7" elements even 

1333 # though there aren't any in these datasets or in the database. 

1334 export.saveDatasets(datasets, elements=["htm7"]) 

1335 # Save one of the data IDs again; this should be harmless 

1336 # because of internal deduplication. 

1337 export.saveDataIds([datasets[0].dataId]) 

1338 # Save some dimension records directly. 

1339 export.saveDimensionData("skymap", [skymapRecord]) 

1340 self.assertTrue(os.path.exists(exportFile)) 

1341 with safeTestTempDir(TESTDIR) as importDir: 

1342 # We always want this to be a local posix butler 

1343 Butler.makeRepo(importDir, config=Config(os.path.join(TESTDIR, "config/basic/butler.yaml"))) 

1344 # Calling script.butlerImport tests the implementation of the 

1345 # butler command line interface "import" subcommand. Functions 

1346 # in the script folder are generally considered protected and 

1347 # should not be used as public api. 

1348 with open(exportFile) as f: 

1349 script.butlerImport( 

1350 importDir, 

1351 export_file=f, 

1352 directory=exportDir, 

1353 transfer="auto", 

1354 skip_dimensions=None, 

1355 ) 

1356 importButler = Butler(importDir, run=self.default_run) 

1357 for ref in datasets: 

1358 with self.subTest(ref=ref): 

1359 # Test for existence by passing in the DatasetType and 

1360 # data ID separately, to avoid lookup by dataset_id. 

1361 self.assertTrue(importButler.exists(ref.datasetType, ref.dataId)) 

1362 self.assertEqual( 

1363 list(importButler.registry.queryDimensionRecords("skymap")), 

1364 [importButler.dimensions["skymap"].RecordClass(**skymapRecord)], 

1365 ) 

1366 
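    # Illustrative sketch (hypothetical helper, not called by the tests): the
    # minimal export/import round trip that runImportExportTest performs with
    # extra checks, using only the public Butler API and this module's imports.
    # The staging file name is an illustrative assumption.
    def _sketch_export_import_roundtrip(self, source: Butler, target: Butler, refs: list[DatasetRef]) -> None:
        with safeTestTempDir(TESTDIR) as staging_dir:
            export_file = os.path.join(staging_dir, "export.yaml")
            # Write the export file and copy the file artifacts into the staging area.
            with source.export(filename=export_file, directory=staging_dir, transfer="auto") as export:
                export.saveDatasets(refs)
            # Load the records and artifacts into the destination repository.
            target.import_(filename=export_file, directory=staging_dir, transfer="auto")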

1367 def testRemoveRuns(self) -> None: 

1368 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1369 butler = Butler(self.tmpConfigFile, writeable=True) 

1370 # Load registry data with dimensions to hang datasets off of. 

1371 registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry")) 

1372 butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) 

1373 # Add some RUN-type collection. 

1374 run1 = "run1" 

1375 butler.registry.registerRun(run1) 

1376 run2 = "run2" 

1377 butler.registry.registerRun(run2) 

1378 # put a dataset in each 

1379 metric = makeExampleMetrics() 

1380 dimensions = butler.dimensions.extract(["instrument", "physical_filter"]) 

1381 datasetType = self.addDatasetType( 

1382 "prune_collections_test_dataset", dimensions, storageClass, butler.registry 

1383 ) 

1384 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1) 

1385 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2) 

1386 uri1 = butler.getURI(ref1) 

1387 uri2 = butler.getURI(ref2) 

1388 

1389 with self.assertRaises(OrphanedRecordError): 

1390 butler.registry.removeDatasetType(datasetType.name) 

1391 

1392 # Remove from both runs with different values for unstore. 

1393 butler.removeRuns([run1], unstore=True) 

1394 butler.removeRuns([run2], unstore=False) 

1395 # Should be nothing in registry for either one, and datastore should 

1396 # not think either exists. 

1397 with self.assertRaises(MissingCollectionError): 

1398 butler.registry.getCollectionType(run1) 

1399 with self.assertRaises(MissingCollectionError): 

1400 butler.registry.getCollectionType(run2) 

1401 self.assertFalse(butler.datastore.exists(ref1)) 

1402 self.assertFalse(butler.datastore.exists(ref2)) 

1403 # The ref we unstored should be gone according to the URI, but the 

1404 # one we forgot should still be around. 

1405 self.assertFalse(uri1.exists()) 

1406 self.assertTrue(uri2.exists()) 

1407 

1408 # Now that the collections have been pruned we can remove the 

1409 # dataset type 

1410 butler.registry.removeDatasetType(datasetType.name) 

1411 

1412 with self.assertLogs("lsst.daf.butler.registries", "INFO") as cm: 

1413 butler.registry.removeDatasetType(("test*", "test*"))

1414 self.assertIn("not defined", "\n".join(cm.output)) 

1415 
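# Illustrative sketch (hypothetical helper, not called by the tests): the two
# removeRuns flavours checked by testRemoveRuns. With unstore=True the file
# artifacts are deleted along with the registry records; with unstore=False
# the datasets are merely forgotten and any artifacts are left on disk.
def _sketch_remove_runs(butler: Butler) -> None:
    butler.removeRuns(["run1"], unstore=True)   # registry records and files removed
    butler.removeRuns(["run2"], unstore=False)  # registry records removed, files left behind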

1416 

1417class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1418 """PosixDatastore specialization of a butler""" 

1419 

1420 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1421 fullConfigKey: str | None = ".datastore.formatters" 

1422 validationCanFail = True 

1423 datastoreStr = ["/tmp"] 

1424 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"] 

1425 registryStr = "/gen3.sqlite3" 

1426 

1427 def testPathConstructor(self) -> None: 

1428 """Independent test of constructor using PathLike.""" 

1429 butler = Butler(self.tmpConfigFile, run=self.default_run) 

1430 self.assertIsInstance(butler, Butler) 

1431 

1432 # And again with a Path object with the butler yaml 

1433 path = pathlib.Path(self.tmpConfigFile) 

1434 butler = Butler(path, writeable=False) 

1435 self.assertIsInstance(butler, Butler) 

1436 

1437 # And again with a Path object without the butler yaml 

1438 # (making sure we skip it if the tmp config doesn't end 

1439 # in butler.yaml -- which is the case for a subclass) 

1440 if self.tmpConfigFile.endswith("butler.yaml"): 

1441 path = pathlib.Path(os.path.dirname(self.tmpConfigFile)) 

1442 butler = Butler(path, writeable=False) 

1443 self.assertIsInstance(butler, Butler) 

1444 

1445 def testExportTransferCopy(self) -> None: 

1446 """Test local export using all transfer modes""" 

1447 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1448 exportButler = self.runPutGetTest(storageClass, "test_metric") 

1449 # Test that the repo actually has at least one dataset. 

1450 datasets = list(exportButler.registry.queryDatasets(..., collections=...)) 

1451 self.assertGreater(len(datasets), 0) 

1452 uris = [exportButler.getURI(d) for d in datasets] 

1453 assert isinstance(exportButler.datastore, FileDatastore) 

1454 datastoreRoot = exportButler.datastore.root 

1455 

1456 pathsInStore = [uri.relative_to(datastoreRoot) for uri in uris] 

1457 

1458 for path in pathsInStore: 

1459 # Assume local file system 

1460 assert path is not None 

1461 self.assertTrue(self.checkFileExists(datastoreRoot, path), f"Checking path {path}") 

1462 

1463 for transfer in ("copy", "link", "symlink", "relsymlink"): 

1464 with safeTestTempDir(TESTDIR) as exportDir: 

1465 with exportButler.export(directory=exportDir, format="yaml", transfer=transfer) as export: 

1466 export.saveDatasets(datasets) 

1467 for path in pathsInStore: 

1468 assert path is not None 

1469 self.assertTrue( 

1470 self.checkFileExists(exportDir, path), 

1471 f"Check that mode {transfer} exported files", 

1472 ) 

1473 

1474 def testPruneDatasets(self) -> None: 

1475 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1476 butler = Butler(self.tmpConfigFile, writeable=True) 

1477 assert isinstance(butler.datastore, FileDatastore) 

1478 # Load registry data with dimensions to hang datasets off of. 

1479 registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry")) 

1480 butler.import_(filename=os.path.join(registryDataDir, "base.yaml")) 

1481 # Add some RUN-type collections. 

1482 run1 = "run1" 

1483 butler.registry.registerRun(run1) 

1484 run2 = "run2" 

1485 butler.registry.registerRun(run2) 

1486 # put some datasets. ref1 and ref2 have the same data ID, and are in 

1487 # different runs. ref3 has a different data ID. 

1488 metric = makeExampleMetrics() 

1489 dimensions = butler.dimensions.extract(["instrument", "physical_filter"]) 

1490 datasetType = self.addDatasetType( 

1491 "prune_collections_test_dataset", dimensions, storageClass, butler.registry 

1492 ) 

1493 ref1 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run1) 

1494 ref2 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-G"}, run=run2) 

1495 ref3 = butler.put(metric, datasetType, {"instrument": "Cam1", "physical_filter": "Cam1-R1"}, run=run1) 

1496 

1497 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1498 for ref, stored in many_stored.items(): 

1499 self.assertTrue(stored, f"Ref {ref} should be stored") 

1500 

1501 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1502 for ref, exists in many_exists.items(): 

1503 self.assertTrue(exists, f"Checking ref {ref} exists.") 

1504 self.assertEqual(exists, DatasetExistence.VERIFIED, f"Ref {ref} should be stored") 

1505 

1506 # Simple prune. 

1507 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1508 self.assertFalse(butler.exists(ref1.datasetType, ref1.dataId, collections=run1)) 

1509 

1510 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1511 for ref, stored in many_stored.items(): 

1512 self.assertFalse(stored, f"Ref {ref} should not be stored") 

1513 

1514 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1515 for ref, exists in many_exists.items(): 

1516 self.assertEqual(exists, DatasetExistence.UNRECOGNIZED, f"Ref {ref} should not be stored") 

1517 

1518 # Put data back. 

1519 ref1_new = butler.put(metric, ref1) 

1520 self.assertEqual(ref1_new, ref1) # Reuses original ID. 

1521 ref2 = butler.put(metric, ref2) 

1522 

1523 many_stored = butler.stored_many([ref1, ref2, ref3]) 

1524 self.assertTrue(many_stored[ref1]) 

1525 self.assertTrue(many_stored[ref2]) 

1526 self.assertFalse(many_stored[ref3]) 

1527 

1528 ref3 = butler.put(metric, ref3) 

1529 

1530 many_exists = butler._exists_many([ref1, ref2, ref3]) 

1531 for ref, exists in many_exists.items(): 

1532 self.assertTrue(exists, f"Ref {ref} should not be stored") 

1533 

1534 # Clear out the datasets from registry and start again. 

1535 refs = [ref1, ref2, ref3] 

1536 butler.pruneDatasets(refs, purge=True, unstore=True) 

1537 for ref in refs: 

1538 butler.put(metric, ref) 

1539 

1540 # Test different forms of file availability. 

1541 # Need to be in a state where: 

1542 # - one ref just has registry record. 

1543 # - one ref has a missing file but a datastore record. 

1544 # - one ref has a missing datastore record but file is there. 

1545 # - one ref does not exist anywhere. 

1546 # Do not need to test a ref that has everything since that is tested 

1547 # above. 

1548 ref0 = DatasetRef( 

1549 datasetType, 

1550 DataCoordinate.standardize( 

1551 {"instrument": "Cam1", "physical_filter": "Cam1-G"}, universe=butler.dimensions 

1552 ), 

1553 run=run1, 

1554 ) 

1555 

1556 # Delete from datastore and retain in Registry. 

1557 butler.pruneDatasets([ref1], purge=False, unstore=True, disassociate=False) 

1558 

1559 # File has been removed. 

1560 uri2 = butler.datastore.getURI(ref2) 

1561 uri2.remove() 

1562 

1563 # Datastore has lost track. 

1564 butler.datastore.forget([ref3]) 

1565 

1566 # First test with a standard butler. 

1567 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True) 

1568 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1569 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED) 

1570 self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE) 

1571 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED) 

1572 

1573 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=False) 

1574 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1575 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED | DatasetExistence._ASSUMED) 

1576 self.assertEqual(exists_many[ref2], DatasetExistence.KNOWN) 

1577 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ASSUMED) 

1578 self.assertTrue(exists_many[ref2]) 

1579 

1580 # Check that per-ref query gives the same answer as many query. 

1581 for ref, exists in exists_many.items(): 

1582 self.assertEqual(butler.exists(ref, full_check=False), exists) 

1583 

1584 # Test again with a trusting butler. 

1585 butler.datastore.trustGetRequest = True 

1586 exists_many = butler._exists_many([ref0, ref1, ref2, ref3], full_check=True) 

1587 self.assertEqual(exists_many[ref0], DatasetExistence.UNRECOGNIZED) 

1588 self.assertEqual(exists_many[ref1], DatasetExistence.RECORDED) 

1589 self.assertEqual(exists_many[ref2], DatasetExistence.RECORDED | DatasetExistence.DATASTORE) 

1590 self.assertEqual(exists_many[ref3], DatasetExistence.RECORDED | DatasetExistence._ARTIFACT) 

1591 

1592 # Check that per-ref query gives the same answer as many query. 

1593 for ref, exists in exists_many.items(): 

1594 self.assertEqual(butler.exists(ref, full_check=True), exists) 

1595 

1596 # Create a ref that surprisingly has the UUID of an existing ref 

1597 # but is not the same. 

1598 ref_bad = DatasetRef(datasetType, dataId=ref3.dataId, run=ref3.run, id=ref2.id) 

1599 with self.assertRaises(ValueError): 

1600 butler.exists(ref_bad) 

1601 

1602 # Create a ref that has a compatible storage class. 

1603 ref_compat = ref2.overrideStorageClass("StructuredDataDict") 

1604 exists = butler.exists(ref_compat) 

1605 self.assertEqual(exists, exists_many[ref2]) 

1606 

1607 # Remove everything and start from scratch. 

1608 butler.datastore.trustGetRequest = False 

1609 butler.pruneDatasets(refs, purge=True, unstore=True) 

1610 for ref in refs: 

1611 butler.put(metric, ref) 

1612 

1613 # These tests mess directly with the trash table and can leave the 

1614 # datastore in an odd state. Do them at the end. 

1615 # Check that in normal mode, deleting the record will lead to 

1616 # trash not touching the file. 

1617 uri1 = butler.datastore.getURI(ref1) 

1618 butler.datastore.bridge.moveToTrash([ref1], transaction=None) # Update the dataset_location table 

1619 butler.datastore.forget([ref1]) 

1620 butler.datastore.trash(ref1) 

1621 butler.datastore.emptyTrash() 

1622 self.assertTrue(uri1.exists()) 

1623 uri1.remove() # Clean it up. 

1624 

1625 # Simulate execution butler setup by deleting the datastore 

1626 # record but keeping the file around and trusting. 

1627 butler.datastore.trustGetRequest = True 

1628 uri2 = butler.datastore.getURI(ref2) 

1629 uri3 = butler.datastore.getURI(ref3) 

1630 self.assertTrue(uri2.exists()) 

1631 self.assertTrue(uri3.exists()) 

1632 

1633 # Remove the datastore record. 

1634 butler.datastore.bridge.moveToTrash([ref2], transaction=None) # Update the dataset_location table 

1635 butler.datastore.forget([ref2]) 

1636 self.assertTrue(uri2.exists()) 

1637 butler.datastore.trash([ref2, ref3]) 

1638 # Immediate removal for ref2 file 

1639 self.assertFalse(uri2.exists()) 

1640 # But ref3 has to wait for the empty. 

1641 self.assertTrue(uri3.exists()) 

1642 butler.datastore.emptyTrash() 

1643 self.assertFalse(uri3.exists()) 

1644 

1645 # Clear out the datasets from registry. 

1646 butler.pruneDatasets([ref1, ref2, ref3], purge=True, unstore=True) 

1647 
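    # Illustrative sketch (hypothetical static helper): DatasetExistence is a
    # flag enum, so the composite states asserted in testPruneDatasets can be
    # decomposed bitwise; UNRECOGNIZED is falsey, any other state is truthy.
    @staticmethod
    def _sketch_has_datastore_record(state: DatasetExistence) -> bool:
        # True when a datastore record was found, regardless of whether the
        # underlying file artifact was verified.
        return bool(state & DatasetExistence.DATASTORE)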

1648 def testPytypeCoercion(self) -> None: 

1649 """Test python type coercion on Butler.get and put.""" 

1650 

1651 # Store some data with the normal example storage class. 

1652 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1653 datasetTypeName = "test_metric" 

1654 butler = self.runPutGetTest(storageClass, datasetTypeName) 

1655 

1656 dataId = {"instrument": "DummyCamComp", "visit": 423} 

1657 metric = butler.get(datasetTypeName, dataId=dataId) 

1658 self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample") 

1659 

1660 datasetType_ori = butler.registry.getDatasetType(datasetTypeName) 

1661 self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents") 

1662 

1663 # Now need to hack the registry dataset type definition. 

1664 # There is no API for this. 

1665 assert isinstance(butler.registry, SqlRegistry) 

1666 manager = butler.registry._managers.datasets 

1667 assert hasattr(manager, "_db") and hasattr(manager, "_static") 

1668 manager._db.update( 

1669 manager._static.dataset_type, 

1670 {"name": datasetTypeName}, 

1671 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataNoComponentsModel"}, 

1672 ) 

1673 

1674 # Force reset of dataset type cache 

1675 butler.registry.refresh() 

1676 

1677 datasetType_new = butler.registry.getDatasetType(datasetTypeName) 

1678 self.assertEqual(datasetType_new.name, datasetType_ori.name) 

1679 self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel") 

1680 

1681 metric_model = butler.get(datasetTypeName, dataId=dataId) 

1682 self.assertNotEqual(type(metric_model), type(metric)) 

1683 self.assertEqual(get_full_type_name(metric_model), "lsst.daf.butler.tests.MetricsExampleModel") 

1684 

1685 # Put the model and read it back to show that everything now 

1686 # works as normal. 

1687 metric_ref = butler.put(metric_model, datasetTypeName, dataId=dataId, visit=424) 

1688 metric_model_new = butler.get(metric_ref) 

1689 self.assertEqual(metric_model_new, metric_model) 

1690 

1691 # Hack the storage class again to something that will fail on the 

1692 # get with no conversion class. 

1693 manager._db.update( 

1694 manager._static.dataset_type, 

1695 {"name": datasetTypeName}, 

1696 {datasetTypeName: datasetTypeName, "storage_class": "StructuredDataListYaml"}, 

1697 ) 

1698 butler.registry.refresh() 

1699 

1700 with self.assertRaises(ValueError): 

1701 butler.get(datasetTypeName, dataId=dataId) 

1702 

1703 
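# Illustrative sketch (hypothetical helper): testPytypeCoercion forces coercion
# by rewriting the registry definition, but a conversion can also be requested
# explicitly on read by overriding a ref's storage class, as testPruneDatasets
# does above. The "StructuredDataDict" storage class name is reused from that
# test and is otherwise an arbitrary choice.
def _sketch_get_as_dict(butler: Butler, ref: DatasetRef) -> Any:
    converted = ref.overrideStorageClass("StructuredDataDict")
    return butler.get(converted)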

1704@unittest.skipUnless(testing is not None, "testing.postgresql module not found") 

1705class PostgresPosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1706 """PosixDatastore specialization of a butler using Postgres""" 

1707 

1708 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1709 fullConfigKey = ".datastore.formatters" 

1710 validationCanFail = True 

1711 datastoreStr = ["/tmp"] 

1712 datastoreName = [f"FileDatastore@{BUTLER_ROOT_TAG}"] 

1713 registryStr = "PostgreSQL@test" 

1714 postgresql: Any 

1715 

1716 @staticmethod 

1717 def _handler(postgresql: Any) -> None: 

1718 engine = sqlalchemy.engine.create_engine(postgresql.url()) 

1719 with engine.begin() as connection: 

1720 connection.execute(sqlalchemy.text("CREATE EXTENSION btree_gist;")) 

1721 

1722 @classmethod 

1723 def setUpClass(cls) -> None: 

1724 # Create the postgres test server. 

1725 cls.postgresql = testing.postgresql.PostgresqlFactory( 

1726 cache_initialized_db=True, on_initialized=cls._handler 

1727 ) 

1728 super().setUpClass() 

1729 

1730 @classmethod 

1731 def tearDownClass(cls) -> None: 

1732 # Clean up any lingering SQLAlchemy engines/connections 

1733 # so they're closed before we shut down the server. 

1734 gc.collect() 

1735 cls.postgresql.clear_cache() 

1736 super().tearDownClass() 

1737 

1738 def setUp(self) -> None: 

1739 self.server = self.postgresql() 

1740 

1741 # Need to add a registry section to the config. 

1742 self._temp_config = False 

1743 config = Config(self.configFile) 

1744 config["registry", "db"] = self.server.url() 

1745 with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as fh: 

1746 config.dump(fh) 

1747 self.configFile = fh.name 

1748 self._temp_config = True 

1749 super().setUp() 

1750 

1751 def tearDown(self) -> None: 

1752 self.server.stop() 

1753 if self._temp_config and os.path.exists(self.configFile): 

1754 os.remove(self.configFile) 

1755 super().tearDown() 

1756 

1757 def testMakeRepo(self) -> None: 

1758 # The base class test assumes that it is using SQLite and that

1759 # the config file is acceptable to SQLite.

1760 raise unittest.SkipTest("Postgres config is not compatible with this test.") 

1761 

1762 
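# Illustrative sketch (hypothetical helper, assuming testing.postgresql is
# importable): the per-test pattern used by the Postgres test case above. A
# throwaway server is started and its connection URL is written into the
# registry section of the butler config; the caller is responsible for
# calling server.stop() when done.
def _sketch_temporary_postgres_config(base_config_file: str) -> Config:
    server = testing.postgresql.Postgresql()  # short-lived local server
    config = Config(base_config_file)
    config["registry", "db"] = server.url()
    return config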

1763class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase): 

1764 """InMemoryDatastore specialization of a butler""" 

1765 

1766 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1767 fullConfigKey = None 

1768 useTempRoot = False 

1769 validationCanFail = False 

1770 datastoreStr = ["datastore='InMemory"] 

1771 datastoreName = ["InMemoryDatastore@"] 

1772 registryStr = "/gen3.sqlite3" 

1773 

1774 def testIngest(self) -> None: 

1775 pass  # Ingest tests are not applicable to an in-memory datastore.

1776 

1777 

1778class ChainedDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1779 """PosixDatastore specialization""" 

1780 

1781 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

1782 fullConfigKey = ".datastore.datastores.1.formatters" 

1783 validationCanFail = True 

1784 datastoreStr = ["datastore='InMemory", "/FileDatastore_1/,", "/FileDatastore_2/'"] 

1785 datastoreName = [ 

1786 "InMemoryDatastore@", 

1787 f"FileDatastore@{BUTLER_ROOT_TAG}/FileDatastore_1", 

1788 "SecondDatastore", 

1789 ] 

1790 registryStr = "/gen3.sqlite3" 

1791 

1792 

1793class ButlerExplicitRootTestCase(PosixDatastoreButlerTestCase): 

1794 """Test that a yaml file in one location can refer to a root in another.""" 

1795 

1796 datastoreStr = ["dir1"] 

1797 # Disable the makeRepo test since we are deliberately not using 

1798 # butler.yaml as the config name. 

1799 fullConfigKey = None 

1800 

1801 def setUp(self) -> None: 

1802 self.root = makeTestTempDir(TESTDIR) 

1803 

1804 # Make a new repository in one place 

1805 self.dir1 = os.path.join(self.root, "dir1") 

1806 Butler.makeRepo(self.dir1, config=Config(self.configFile)) 

1807 

1808 # Move the yaml file to a different place and add a "root" 

1809 self.dir2 = os.path.join(self.root, "dir2") 

1810 os.makedirs(self.dir2, exist_ok=True) 

1811 configFile1 = os.path.join(self.dir1, "butler.yaml") 

1812 config = Config(configFile1) 

1813 config["root"] = self.dir1 

1814 configFile2 = os.path.join(self.dir2, "butler2.yaml") 

1815 config.dumpToUri(configFile2) 

1816 os.remove(configFile1) 

1817 self.tmpConfigFile = configFile2 

1818 

1819 def testFileLocations(self) -> None: 

1820 self.assertNotEqual(self.dir1, self.dir2) 

1821 self.assertTrue(os.path.exists(os.path.join(self.dir2, "butler2.yaml"))) 

1822 self.assertFalse(os.path.exists(os.path.join(self.dir1, "butler.yaml"))) 

1823 self.assertTrue(os.path.exists(os.path.join(self.dir1, "gen3.sqlite3"))) 

1824 

1825 

1826class ButlerMakeRepoOutfileTestCase(ButlerPutGetTests, unittest.TestCase): 

1827 """Test that a config file created by makeRepo outside of repo works.""" 

1828 

1829 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1830 

1831 def setUp(self) -> None: 

1832 self.root = makeTestTempDir(TESTDIR) 

1833 self.root2 = makeTestTempDir(TESTDIR) 

1834 

1835 self.tmpConfigFile = os.path.join(self.root2, "different.yaml") 

1836 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1837 

1838 def tearDown(self) -> None: 

1839 if os.path.exists(self.root2): 

1840 shutil.rmtree(self.root2, ignore_errors=True) 

1841 super().tearDown() 

1842 

1843 def testConfigExistence(self) -> None: 

1844 c = Config(self.tmpConfigFile) 

1845 uri_config = ResourcePath(c["root"]) 

1846 uri_expected = ResourcePath(self.root, forceDirectory=True) 

1847 self.assertEqual(uri_config.geturl(), uri_expected.geturl()) 

1848 self.assertNotIn(":", uri_config.path, "Check for URI concatenated with normal path") 

1849 

1850 def testPutGet(self) -> None: 

1851 storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") 

1852 self.runPutGetTest(storageClass, "test_metric") 

1853 

1854 

1855class ButlerMakeRepoOutfileDirTestCase(ButlerMakeRepoOutfileTestCase): 

1856 """Test that a config file created by makeRepo outside of repo works.""" 

1857 

1858 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1859 

1860 def setUp(self) -> None: 

1861 self.root = makeTestTempDir(TESTDIR) 

1862 self.root2 = makeTestTempDir(TESTDIR) 

1863 

1864 self.tmpConfigFile = self.root2 

1865 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1866 

1867 def testConfigExistence(self) -> None: 

1868 # Append the yaml file name, otherwise the Config constructor does

1869 # not know the file type.

1870 self.tmpConfigFile = os.path.join(self.tmpConfigFile, "butler.yaml") 

1871 super().testConfigExistence() 

1872 

1873 

1874class ButlerMakeRepoOutfileUriTestCase(ButlerMakeRepoOutfileTestCase): 

1875 """Test that a config file created by makeRepo outside of repo works.""" 

1876 

1877 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1878 

1879 def setUp(self) -> None: 

1880 self.root = makeTestTempDir(TESTDIR) 

1881 self.root2 = makeTestTempDir(TESTDIR) 

1882 

1883 self.tmpConfigFile = ResourcePath(os.path.join(self.root2, "something.yaml")).geturl() 

1884 Butler.makeRepo(self.root, config=Config(self.configFile), outfile=self.tmpConfigFile) 

1885 

1886 

1887@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!") 

1888class S3DatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase): 

1889 """S3Datastore specialization of a butler; an S3 storage Datastore + 

1890 a local in-memory SqlRegistry. 

1891 """ 

1892 

1893 configFile = os.path.join(TESTDIR, "config/basic/butler-s3store.yaml") 

1894 fullConfigKey = None 

1895 validationCanFail = True 

1896 

1897 bucketName = "anybucketname" 

1898 """Name of the Bucket that will be used in the tests. The name is read from 

1899 the config file used with the tests during set-up. 

1900 """ 

1901 

1902 root = "butlerRoot/" 

1903 """Root repository directory expected to be used in case useTempRoot=False. 

1904 Otherwise the root is set to a 20 characters long randomly generated string 

1905 during set-up. 

1906 """ 

1907 

1908 datastoreStr = [f"datastore={root}"] 

1909 """Contains all expected root locations in a format expected to be 

1910 returned by Butler stringification. 

1911 """ 

1912 

1913 datastoreName = ["FileDatastore@s3://{bucketName}/{root}"] 

1914 """The expected format of the S3 Datastore string.""" 

1915 

1916 registryStr = "/gen3.sqlite3" 

1917 """Expected format of the Registry string.""" 

1918 

1919 mock_s3 = mock_s3() 

1920 """The mocked s3 interface from moto.""" 

1921 

1922 def genRoot(self) -> str: 

1923 """Returns a random string of len 20 to serve as a root 

1924 name for the temporary bucket repo. 

1925 

1926 This is equivalent to tempfile.mkdtemp as this is what self.root 

1927 becomes when useTempRoot is True. 

1928 """ 

1929 rndstr = "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(20)) 

1930 return rndstr + "/" 

1931 

1932 def setUp(self) -> None: 

1933 config = Config(self.configFile) 

1934 uri = ResourcePath(config[".datastore.datastore.root"]) 

1935 self.bucketName = uri.netloc 

1936 

1937 # Enable S3 mocking of tests. 

1938 self.mock_s3.start() 

1939 

1940 # set up some fake credentials if they do not exist 

1941 self.usingDummyCredentials = setAwsEnvCredentials() 

1942 

1943 if self.useTempRoot: 

1944 self.root = self.genRoot() 

1945 rooturi = f"s3://{self.bucketName}/{self.root}" 

1946 config.update({"datastore": {"datastore": {"root": rooturi}}}) 

1947 

1948 # need local folder to store registry database 

1949 self.reg_dir = makeTestTempDir(TESTDIR) 

1950 config["registry", "db"] = f"sqlite:///{self.reg_dir}/gen3.sqlite3" 

1951 

1952 # Moto needs to know that we expect the bucket to exist

1953 # (its name used to be the class attribute bucketName).

1954 s3 = boto3.resource("s3") 

1955 s3.create_bucket(Bucket=self.bucketName) 

1956 

1957 self.datastoreStr = [f"datastore='{rooturi}'"] 

1958 self.datastoreName = [f"FileDatastore@{rooturi}"] 

1959 Butler.makeRepo(rooturi, config=config, forceConfigRoot=False) 

1960 self.tmpConfigFile = posixpath.join(rooturi, "butler.yaml") 

1961 

1962 def tearDown(self) -> None: 

1963 s3 = boto3.resource("s3") 

1964 bucket = s3.Bucket(self.bucketName) 

1965 try: 

1966 bucket.objects.all().delete() 

1967 except botocore.exceptions.ClientError as e: 

1968 if e.response["Error"]["Code"] == "404": 

1969 # the key was not reachable - pass 

1970 pass 

1971 else: 

1972 raise 

1973 

1974 bucket = s3.Bucket(self.bucketName) 

1975 bucket.delete() 

1976 

1977 # Stop the S3 mock. 

1978 self.mock_s3.stop() 

1979 

1980 # unset any potentially set dummy credentials 

1981 if self.usingDummyCredentials: 

1982 unsetAwsEnvCredentials() 

1983 

1984 if self.reg_dir is not None and os.path.exists(self.reg_dir): 

1985 shutil.rmtree(self.reg_dir, ignore_errors=True) 

1986 

1987 if self.useTempRoot and os.path.exists(self.root): 

1988 shutil.rmtree(self.root, ignore_errors=True) 

1989 

1990 super().tearDown() 

1991 

1992 
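# Illustrative sketch (hypothetical helper, assuming boto3 and moto are
# importable): the moto lifecycle used by S3DatastoreButlerTestCase. Between
# start() and stop() all boto3 calls hit an in-process fake S3, so a bucket
# must be created before the datastore root can resolve. The bucket name is
# an arbitrary example.
def _sketch_with_mock_bucket(bucket_name: str = "example-bucket") -> None:
    mock = mock_s3()
    mock.start()
    try:
        boto3.resource("s3").create_bucket(Bucket=bucket_name)
        # ... exercise S3-backed butler code here ...
    finally:
        mock.stop()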

1993class PosixDatastoreTransfers(unittest.TestCase): 

1994 """Test data transfers between butlers. 

1995 

1996 Test for different managers. UUID to UUID and integer to integer are 

1997 tested. UUID to integer is not supported since we do not currently 

1998 want to allow that. Integer to UUID is supported with the caveat 

1999 that UUID4 will be generated and this will be incorrect for raw 

2000 dataset types. The test ignores that. 

2001 """ 

2002 

2003 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

2004 storageClassFactory: StorageClassFactory 

2005 

2006 @classmethod 

2007 def setUpClass(cls) -> None: 

2008 cls.storageClassFactory = StorageClassFactory() 

2009 cls.storageClassFactory.addFromConfig(cls.configFile) 

2010 

2011 def setUp(self) -> None: 

2012 self.root = makeTestTempDir(TESTDIR) 

2013 self.config = Config(self.configFile) 

2014 

2015 def tearDown(self) -> None: 

2016 removeTestTempDir(self.root) 

2017 

2018 def create_butler(self, manager: str, label: str) -> Butler: 

2019 config = Config(self.configFile) 

2020 config["registry", "managers", "datasets"] = manager 

2021 return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True) 

2022 

2023 def create_butlers(self, manager1: str | None = None, manager2: str | None = None) -> None: 

2024 default = "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID" 

2025 if manager1 is None: 

2026 manager1 = default 

2027 if manager2 is None: 

2028 manager2 = default 

2029 self.source_butler = self.create_butler(manager1, "1") 

2030 self.target_butler = self.create_butler(manager2, "2") 

2031 

2032 def testTransferUuidToUuid(self) -> None: 

2033 self.create_butlers() 

2034 self.assertButlerTransfers() 

2035 

2036 def _enable_trust(self, datastore: Datastore) -> None: 

2037 if hasattr(datastore, "trustGetRequest"): 

2038 datastore.trustGetRequest = True 

2039 elif hasattr(datastore, "datastores"): 

2040 for datastore in datastore.datastores: 

2041 if hasattr(datastore, "trustGetRequest"): 

2042 datastore.trustGetRequest = True 

2043 

2044 def testTransferMissing(self) -> None: 

2045 """Test transfers where datastore records are missing. 

2046 

2047 This is how execution butler works. 

2048 """ 

2049 self.create_butlers() 

2050 

2051 # Configure the source butler to allow trust. 

2052 self._enable_trust(self.source_butler.datastore) 

2053 

2054 self.assertButlerTransfers(purge=True) 

2055 

2056 def testTransferMissingDisassembly(self) -> None: 

2057 """Test transfers where datastore records are missing. 

2058 

2059 This is how execution butler works. 

2060 """ 

2061 self.create_butlers() 

2062 

2063 # Configure the source butler to allow trust. 

2064 self._enable_trust(self.source_butler.datastore) 

2065 

2066 # Test disassembly. 

2067 self.assertButlerTransfers(purge=True, storageClassName="StructuredComposite") 

2068 

2069 def testAbsoluteURITransferDirect(self) -> None: 

2070 """Test transfer using an absolute URI.""" 

2071 self._absolute_transfer("auto") 

2072 

2073 def testAbsoluteURITransferCopy(self) -> None: 

2074 """Test transfer using an absolute URI.""" 

2075 self._absolute_transfer("copy") 

2076 

2077 def _absolute_transfer(self, transfer: str) -> None: 

2078 self.create_butlers() 

2079 

2080 storageClassName = "StructuredData" 

2081 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

2082 datasetTypeName = "random_data" 

2083 run = "run1" 

2084 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

2085 

2086 dimensions = self.source_butler.dimensions.extract(()) 

2087 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

2088 self.source_butler.registry.registerDatasetType(datasetType) 

2089 

2090 metrics = makeExampleMetrics() 

2091 with ResourcePath.temporary_uri(suffix=".json") as temp: 

2092 dataId = DataCoordinate.makeEmpty(self.source_butler.dimensions) 

2093 source_refs = [DatasetRef(datasetType, dataId, run=run)] 

2094 temp.write(json.dumps(metrics.exportAsDict()).encode()) 

2095 dataset = FileDataset(path=temp, refs=source_refs) 

2096 self.source_butler.ingest(dataset, transfer="direct") 

2097 

2098 self.target_butler.transfer_from( 

2099 self.source_butler, dataset.refs, register_dataset_types=True, transfer=transfer 

2100 ) 

2101 

2102 uri = self.target_butler.getURI(dataset.refs[0]) 

2103 if transfer == "auto": 

2104 self.assertEqual(uri, temp) 

2105 else: 

2106 self.assertNotEqual(uri, temp) 

2107 
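    # Illustrative sketch (hypothetical helper method, not called by the tests):
    # the core call exercised by assertButlerTransfers below.
    # register_dataset_types and transfer_dimensions pre-create the target-side
    # definitions so that the copy of records and file artifacts can succeed in
    # an otherwise empty repository.
    def _sketch_transfer(self, refs: list[DatasetRef]) -> int:
        transferred = self.target_butler.transfer_from(
            self.source_butler,
            refs,
            transfer="auto",
            register_dataset_types=True,
            transfer_dimensions=True,
        )
        return len(transferred)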

2108 def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "StructuredData") -> None: 

2109 """Test that a run can be transferred to another butler.""" 

2110 

2111 storageClass = self.storageClassFactory.getStorageClass(storageClassName) 

2112 datasetTypeName = "random_data" 

2113 

2114 # Test will create 3 collections and we will want to transfer 

2115 # two of those three. 

2116 runs = ["run1", "run2", "other"] 

2117 

2118 # Also want to use two different dataset types to ensure that 

2119 # grouping works. 

2120 datasetTypeNames = ["random_data", "random_data_2"] 

2121 

2122 # Create the run collections in the source butler. 

2123 for run in runs: 

2124 self.source_butler.registry.registerCollection(run, CollectionType.RUN) 

2125 

2126 # Create dimensions in source butler. 

2127 n_exposures = 30 

2128 self.source_butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"}) 

2129 self.source_butler.registry.insertDimensionData( 

2130 "physical_filter", {"instrument": "DummyCamComp", "name": "d-r", "band": "R"} 

2131 ) 

2132 self.source_butler.registry.insertDimensionData( 

2133 "detector", {"instrument": "DummyCamComp", "id": 1, "full_name": "det1"} 

2134 ) 

2135 

2136 for i in range(n_exposures): 

2137 self.source_butler.registry.insertDimensionData( 

2138 "exposure", 

2139 {"instrument": "DummyCamComp", "id": i, "obs_id": f"exp{i}", "physical_filter": "d-r"}, 

2140 ) 

2141 

2142 # Create dataset types in the source butler. 

2143 dimensions = self.source_butler.dimensions.extract(["instrument", "exposure"]) 

2144 for datasetTypeName in datasetTypeNames: 

2145 datasetType = DatasetType(datasetTypeName, dimensions, storageClass) 

2146 self.source_butler.registry.registerDatasetType(datasetType) 

2147 

2148 # Write a dataset to an unrelated run -- this will ensure that 

2149 # we are rewriting integer dataset ids in the target if necessary. 

2150 # Will not be relevant for UUID. 

2151 run = "distraction" 

2152 butler = Butler(butler=self.source_butler, run=run) 

2153 butler.put( 

2154 makeExampleMetrics(), 

2155 datasetTypeName, 

2156 exposure=1, 

2157 instrument="DummyCamComp", 

2158 physical_filter="d-r", 

2159 ) 

2160 

2161 # Write some example metrics to the source 

2162 butler = Butler(butler=self.source_butler) 

2163 

2164 # Set of DatasetRefs that should be in the list of refs to transfer 

2165 # but which will not be transferred. 

2166 deleted: set[DatasetRef] = set() 

2167 

2168 n_expected = 20 # Number of datasets expected to be transferred 

2169 source_refs = [] 

2170 for i in range(n_exposures): 

2171 # Put a third of datasets into each collection, only retain 

2172 # two thirds. 

2173 index = i % 3 

2174 run = runs[index] 

2175 datasetTypeName = datasetTypeNames[i % 2] 

2176 

2177 metric = MetricsExample( 

2178 summary={"counter": i}, output={"text": "metric"}, data=[2 * x for x in range(i)] 

2179 ) 

2180 dataId = {"exposure": i, "instrument": "DummyCamComp", "physical_filter": "d-r"} 

2181 ref = butler.put(metric, datasetTypeName, dataId=dataId, run=run) 

2182 

2183 # Remove the datastore record using low-level API 

2184 if purge: 

2185 # Remove records for a fraction. 

2186 if index == 1: 

2187 # For one of these delete the file as well. 

2188 # This allows the "missing" code to filter the 

2189 # file out. 

2190 # Access the individual datastores. 

2191 datastores = [] 

2192 if hasattr(butler.datastore, "datastores"): 

2193 datastores.extend(butler.datastore.datastores) 

2194 else: 

2195 datastores.append(butler.datastore) 

2196 

2197 if not deleted: 

2198 # For a chained datastore we need to remove 

2199 # files in each chain. 

2200 for datastore in datastores: 

2201 # The file might not be known to the datastore 

2202 # if constraints are used. 

2203 try: 

2204 primary, uris = datastore.getURIs(ref) 

2205 except FileNotFoundError: 

2206 continue 

2207 if primary: 

2208 if primary.scheme != "mem": 

2209 primary.remove() 

2210 for uri in uris.values(): 

2211 if uri.scheme != "mem": 

2212 uri.remove() 

2213 n_expected -= 1 

2214 deleted.add(ref) 

2215 

2216 # Remove the datastore record. 

2217 for datastore in datastores: 

2218 if hasattr(datastore, "removeStoredItemInfo"): 

2219 datastore.removeStoredItemInfo(ref) 

2220 

2221 if index < 2: 

2222 source_refs.append(ref) 

2223 if ref not in deleted: 

2224 new_metric = butler.get(ref) 

2225 self.assertEqual(new_metric, metric) 

2226 

2227 # Create some bad dataset types to ensure we check for inconsistent 

2228 # definitions. 

2229 badStorageClass = self.storageClassFactory.getStorageClass("StructuredDataList") 

2230 for datasetTypeName in datasetTypeNames: 

2231 datasetType = DatasetType(datasetTypeName, dimensions, badStorageClass) 

2232 self.target_butler.registry.registerDatasetType(datasetType) 

2233 with self.assertRaises(ConflictingDefinitionError) as cm: 

2234 self.target_butler.transfer_from(self.source_butler, source_refs) 

2235 self.assertIn("dataset type differs", str(cm.exception)) 

2236 

2237 # And remove the bad definitions. 

2238 for datasetTypeName in datasetTypeNames: 

2239 self.target_butler.registry.removeDatasetType(datasetTypeName) 

2240 

2241 # Transfer without creating dataset types should fail. 

2242 with self.assertRaises(KeyError): 

2243 self.target_butler.transfer_from(self.source_butler, source_refs) 

2244 

2245 # Transfer without creating dimensions should fail. 

2246 with self.assertRaises(ConflictingDefinitionError) as cm: 

2247 self.target_butler.transfer_from(self.source_butler, source_refs, register_dataset_types=True) 

2248 self.assertIn("dimension", str(cm.exception)) 

2249 

2250 # The failed transfer above leaves registry in an inconsistent 

2251 # state because the run is created but then rolled back without 

2252 # the collection cache being cleared. For now force a refresh. 

2253 # Can remove with DM-35498. 

2254 self.target_butler.registry.refresh() 

2255 

2256 # Now transfer them to the second butler, including dimensions. 

2257 with self.assertLogs(level=logging.DEBUG) as log_cm: 

2258 transferred = self.target_butler.transfer_from( 

2259 self.source_butler, 

2260 source_refs, 

2261 register_dataset_types=True, 

2262 transfer_dimensions=True, 

2263 ) 

2264 self.assertEqual(len(transferred), n_expected) 

2265 log_output = ";".join(log_cm.output) 

2266 

2267 # A ChainedDatastore will use the in-memory datastore for mexists 

2268 # so we can not rely on the mexists log message. 

2269 self.assertIn("Number of datastore records found in source", log_output) 

2270 self.assertIn("Creating output run", log_output) 

2271 

2272 # Do the transfer twice to ensure that it will do nothing extra. 

2273 # Only do this if purge=True because it does not work for int 

2274 # dataset_id. 

2275 if purge: 

2276 # This should not need to register dataset types. 

2277 transferred = self.target_butler.transfer_from(self.source_butler, source_refs) 

2278 self.assertEqual(len(transferred), n_expected) 

2279 

2280 # Also do an explicit low-level transfer to trigger some 

2281 # edge cases. 

2282 with self.assertLogs(level=logging.DEBUG) as log_cm: 

2283 self.target_butler.datastore.transfer_from(self.source_butler.datastore, source_refs) 

2284 log_output = ";".join(log_cm.output) 

2285 self.assertIn("no file artifacts exist", log_output) 

2286 

2287 with self.assertRaises((TypeError, AttributeError)): 

2288 self.target_butler.datastore.transfer_from(self.source_butler, source_refs) # type: ignore 

2289 

2290 with self.assertRaises(ValueError): 

2291 self.target_butler.datastore.transfer_from( 

2292 self.source_butler.datastore, source_refs, transfer="split" 

2293 ) 

2294 

2295 # Now try to get the same refs from the new butler. 

2296 for ref in source_refs: 

2297 if ref not in deleted: 

2298 new_metric = self.target_butler.get(ref) 

2299 old_metric = self.source_butler.get(ref) 

2300 self.assertEqual(new_metric, old_metric) 

2301 

2302 # Now prune run2 collection and create instead a CHAINED collection. 

2303 # This should block the transfer. 

2304 self.target_butler.removeRuns(["run2"], unstore=True) 

2305 self.target_butler.registry.registerCollection("run2", CollectionType.CHAINED) 

2306 with self.assertRaises(CollectionTypeError): 

2307 # Re-importing the run1 datasets can be problematic if they 

2308 # use integer IDs so filter those out. 

2309 to_transfer = [ref for ref in source_refs if ref.run == "run2"] 

2310 self.target_butler.transfer_from(self.source_butler, to_transfer) 

2311 

2312 

2313class ChainedDatastoreTransfers(PosixDatastoreTransfers): 

2314 configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml") 

2315 

2316 

2317def setup_module(module) -> None: 

2318 clean_environment() 

2319 

2320 

2321if __name__ == "__main__": 

2322 clean_environment() 

2323 unittest.main()