Coverage for python/lsst/daf/butler/registry/tests/_registry.py: 4%

1434 statements  

coverage.py v7.2.3, created at 2023-04-19 03:42 -0700

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RegistryTests"]

import itertools
import logging
import os
import re
import unittest
import uuid
from abc import ABC, abstractmethod
from collections import defaultdict, namedtuple
from datetime import datetime, timedelta
from typing import TYPE_CHECKING, Iterator, Optional, Type, Union

import astropy.time
import sqlalchemy

try:
    import numpy as np
except ImportError:
    np = None

import lsst.sphgeom
from lsst.daf.relation import Relation, RelationalAlgebraError, Transfer, iteration, sql

from ...core import (
    DataCoordinate,
    DataCoordinateSet,
    DatasetAssociation,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    NamedValueSet,
    SkyPixDimension,
    StorageClass,
    Timespan,
    ddl,
)
from .._collection_summary import CollectionSummary
from .._collectionType import CollectionType
from .._config import RegistryConfig
from .._exceptions import (
    ArgumentError,
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    DatasetTypeError,
    InconsistentDataIdError,
    MissingCollectionError,
    MissingDatasetTypeError,
    OrphanedRecordError,
)
from ..interfaces import ButlerAttributeExistsError, DatasetIdGenEnum

if TYPE_CHECKING:
    from .._registry import Registry


class RegistryTests(ABC):
    """Generic tests for the `Registry` class that can be subclassed to
    generate tests for different configurations.
    """

    collectionsManager: Optional[str] = None
    """Name of the collections manager class. If a subclass provides a value
    for this member, it overrides the name specified in the default
    configuration (`str`).
    """

    datasetsManager: Optional[str | dict[str, str]] = None
    """Name or configuration dictionary of the datasets manager class. If a
    subclass provides a value for this member, it overrides the name specified
    in the default configuration (`str` or `dict`).
    """
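    # A minimal sketch (not part of this ABC's API) of how a concrete
    # subclass might override these members; the exact manager module path is
    # an assumption and may differ between daf_butler versions:
    #
    #     class MyRegistryTests(RegistryTests, unittest.TestCase):
    #         datasetsManager = (
    #             "lsst.daf.butler.registry.datasets.byDimensions"
    #             ".ByDimensionsDatasetRecordStorageManagerUUID"
    #         )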

    @classmethod
    @abstractmethod
    def getDataDir(cls) -> str:
        """Return the root directory containing test data YAML files."""
        raise NotImplementedError()

    def makeRegistryConfig(self) -> RegistryConfig:
        """Create a RegistryConfig used to create a registry.

        This method should be called by a subclass from `makeRegistry`.
        The returned instance will be pre-configured based on the values of
        class members, and default-configured for all other parameters.
        Subclasses that need default configuration should just instantiate
        `RegistryConfig` directly.
        """
        config = RegistryConfig()
        if self.collectionsManager:
            config["managers", "collections"] = self.collectionsManager
        if self.datasetsManager:
            config["managers", "datasets"] = self.datasetsManager
        return config
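    # A hedged sketch of how a subclass's `makeRegistry` might consume this
    # config; `Registry.createFromConfig` and the SQLite URI are assumptions
    # about the concrete harness, not requirements of this base class:
    #
    #     def makeRegistry(self, share_repo_with=None):
    #         if share_repo_with is not None:
    #             return None  # an in-memory DB cannot be shared
    #         config = self.makeRegistryConfig()
    #         config["db"] = "sqlite://"
    #         return Registry.createFromConfig(config)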

    @abstractmethod
    def makeRegistry(self, share_repo_with: Optional[Registry] = None) -> Optional[Registry]:
        """Return the Registry instance to be tested.

        Parameters
        ----------
        share_repo_with : `Registry`, optional
            If provided, the new registry should point to the same data
            repository as this existing registry.

        Returns
        -------
        registry : `Registry`
            New `Registry` instance, or `None` *only* if `share_repo_with` is
            not `None` and this test case does not support that argument
            (e.g. it is impossible with in-memory SQLite DBs).
        """
        raise NotImplementedError()

    def loadData(self, registry: Registry, filename: str):
        """Load registry test data from ``getDataDir/<filename>``,
        which should be a YAML import/export file.
        """
        from ...transfers import YamlRepoImportBackend

        with open(os.path.join(self.getDataDir(), filename), "r") as stream:
            backend = YamlRepoImportBackend(stream, registry)
        backend.register()
        backend.load(datastore=None)
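    # Typical use within the tests below (files such as ``base.yaml`` and
    # ``datasets.yaml`` are expected to live in the directory returned by
    # `getDataDir`):
    #
    #     registry = self.makeRegistry()
    #     self.loadData(registry, "base.yaml")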

    def checkQueryResults(self, results, expected):
        """Check that a query results object contains expected values.

        Parameters
        ----------
        results : `DataCoordinateQueryResults` or `DatasetQueryResults`
            A lazy-evaluation query results object.
        expected : `list`
            A list of `DataCoordinate` or `DatasetRef` objects that should be
            equal to the results of the query, aside from ordering.
        """
        self.assertCountEqual(list(results), expected)
        self.assertEqual(results.count(), len(expected))
        if expected:
            self.assertTrue(results.any())
        else:
            self.assertFalse(results.any())
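    # Example invocation (the expected list may hold `DataCoordinate` or
    # `DatasetRef` objects; ordering is deliberately ignored):
    #
    #     self.checkQueryResults(
    #         registry.queryDataIds(["detector"], instrument="Cam1"),
    #         expected=[...],
    #     )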

    def testOpaque(self):
        """Tests for `Registry.registerOpaqueTable`,
        `Registry.insertOpaqueData`, `Registry.fetchOpaqueData`, and
        `Registry.deleteOpaqueData`.
        """
        registry = self.makeRegistry()
        table = "opaque_table_for_testing"
        registry.registerOpaqueTable(
            table,
            spec=ddl.TableSpec(
                fields=[
                    ddl.FieldSpec("id", dtype=sqlalchemy.BigInteger, primaryKey=True),
                    ddl.FieldSpec("name", dtype=sqlalchemy.String, length=16, nullable=False),
                    ddl.FieldSpec("count", dtype=sqlalchemy.SmallInteger, nullable=True),
                ],
            ),
        )
        rows = [
            {"id": 1, "name": "one", "count": None},
            {"id": 2, "name": "two", "count": 5},
            {"id": 3, "name": "three", "count": 6},
        ]
        registry.insertOpaqueData(table, *rows)
        self.assertCountEqual(rows, list(registry.fetchOpaqueData(table)))
        self.assertEqual(rows[0:1], list(registry.fetchOpaqueData(table, id=1)))
        self.assertEqual(rows[1:2], list(registry.fetchOpaqueData(table, name="two")))
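        # Multiple keyword filters (below) are ANDed together, and a tuple or
        # list value acts as an IN clause; only the id=1 row satisfies both
        # constraints here.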

        self.assertEqual(rows[0:1], list(registry.fetchOpaqueData(table, id=(1, 3), name=("one", "two"))))
        self.assertEqual(rows, list(registry.fetchOpaqueData(table, id=(1, 2, 3))))
        # Test a very long IN clause which exceeds the SQLite limit on the
        # number of parameters. SQLite says the limit is 32k but it looks
        # like it is much higher.
        self.assertEqual(rows, list(registry.fetchOpaqueData(table, id=list(range(300_000)))))
        # Two IN clauses, each longer than the 1k batch size: the first has
        # duplicates, the second has matching elements in different batches
        # (after sorting).
        self.assertEqual(
            rows[0:2],
            list(
                registry.fetchOpaqueData(
                    table,
                    id=list(range(1000)) + list(range(100, 0, -1)),
                    name=["one"] + [f"q{i}" for i in range(2200)] + ["two"],
                )
            ),
        )
        self.assertEqual([], list(registry.fetchOpaqueData(table, id=1, name="two")))
        registry.deleteOpaqueData(table, id=3)
        self.assertCountEqual(rows[:2], list(registry.fetchOpaqueData(table)))
        registry.deleteOpaqueData(table)
        self.assertEqual([], list(registry.fetchOpaqueData(table)))

    def testDatasetType(self):
        """Tests for `Registry.registerDatasetType` and
        `Registry.getDatasetType`.
        """
        registry = self.makeRegistry()
        # Check valid insert
        datasetTypeName = "test"
        storageClass = StorageClass("testDatasetType")
        registry.storageClasses.registerStorageClass(storageClass)
        dimensions = registry.dimensions.extract(("instrument", "visit"))
        differentDimensions = registry.dimensions.extract(("instrument", "patch"))
        inDatasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        # Inserting for the first time should return True
        self.assertTrue(registry.registerDatasetType(inDatasetType))
        outDatasetType1 = registry.getDatasetType(datasetTypeName)
        self.assertEqual(outDatasetType1, inDatasetType)

        # Re-inserting should work
        self.assertFalse(registry.registerDatasetType(inDatasetType))
        # Except when the definitions are not identical
        with self.assertRaises(ConflictingDefinitionError):
            nonIdenticalDatasetType = DatasetType(datasetTypeName, differentDimensions, storageClass)
            registry.registerDatasetType(nonIdenticalDatasetType)

        # Template can be None
        datasetTypeName = "testNoneTemplate"
        storageClass = StorageClass("testDatasetType2")
        registry.storageClasses.registerStorageClass(storageClass)
        dimensions = registry.dimensions.extract(("instrument", "visit"))
        inDatasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(inDatasetType)
        outDatasetType2 = registry.getDatasetType(datasetTypeName)
        self.assertEqual(outDatasetType2, inDatasetType)

        allTypes = set(registry.queryDatasetTypes())
        self.assertEqual(allTypes, {outDatasetType1, outDatasetType2})

    def testDimensions(self):
        """Tests for `Registry.insertDimensionData`,
        `Registry.syncDimensionData`, and `Registry.expandDataId`.
        """
        registry = self.makeRegistry()
        dimensionName = "instrument"
        dimension = registry.dimensions[dimensionName]
        dimensionValue = {
            "name": "DummyCam",
            "visit_max": 10,
            "visit_system": 0,
            "exposure_max": 10,
            "detector_max": 2,
            "class_name": "lsst.pipe.base.Instrument",
        }
        registry.insertDimensionData(dimensionName, dimensionValue)
        # Inserting the same value twice should fail
        with self.assertRaises(sqlalchemy.exc.IntegrityError):
            registry.insertDimensionData(dimensionName, dimensionValue)
        # expandDataId should retrieve the record we just inserted
        self.assertEqual(
            registry.expandDataId(instrument="DummyCam", graph=dimension.graph)
            .records[dimensionName]
            .toDict(),
            dimensionValue,
        )
        # expandDataId should raise if there is no record with the given ID.
        with self.assertRaises(DataIdValueError):
            registry.expandDataId({"instrument": "Unknown"}, graph=dimension.graph)
        # band doesn't have a table; insert should fail.
        with self.assertRaises(TypeError):
            registry.insertDimensionData("band", {"band": "i"})
        dimensionName2 = "physical_filter"
        dimension2 = registry.dimensions[dimensionName2]
        dimensionValue2 = {"name": "DummyCam_i", "band": "i"}
        # A missing required dependency ("instrument") should fail
        with self.assertRaises(KeyError):
            registry.insertDimensionData(dimensionName2, dimensionValue2)
        # Adding the required dependency should fix the failure
        dimensionValue2["instrument"] = "DummyCam"
        registry.insertDimensionData(dimensionName2, dimensionValue2)
        # expandDataId should retrieve the record we just inserted.
        self.assertEqual(
            registry.expandDataId(instrument="DummyCam", physical_filter="DummyCam_i", graph=dimension2.graph)
            .records[dimensionName2]
            .toDict(),
            dimensionValue2,
        )
        # Use syncDimensionData to insert a new record successfully.
        dimensionName3 = "detector"
        dimensionValue3 = {
            "instrument": "DummyCam",
            "id": 1,
            "full_name": "one",
            "name_in_raft": "zero",
            "purpose": "SCIENCE",
        }
        self.assertTrue(registry.syncDimensionData(dimensionName3, dimensionValue3))
        # Sync that again. Note that one field ("raft") is NULL, and that
        # should be okay.
        self.assertFalse(registry.syncDimensionData(dimensionName3, dimensionValue3))
        # Now try that sync with the same primary key but a different value.
        # This should fail.
        with self.assertRaises(ConflictingDefinitionError):
            registry.syncDimensionData(
                dimensionName3,
                {
                    "instrument": "DummyCam",
                    "id": 1,
                    "full_name": "one",
                    "name_in_raft": "four",
                    "purpose": "SCIENCE",
                },
            )

    @unittest.skipIf(np is None, "numpy not available.")
    def testNumpyDataId(self):
        """Test that we can use a numpy int in a dataId."""
        registry = self.makeRegistry()
        dimensionEntries = [
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            # Using an np.int64 here fails unless Records.fromDict is also
            # patched to look for numbers.Integral
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        for args in dimensionEntries:
            registry.insertDimensionData(*args)

        # Try a normal integer and something that looks like an int but
        # is not.
        for visit_id in (42, np.int64(42)):
            with self.subTest(visit_id=visit_id, id_type=type(visit_id).__name__):
                expanded = registry.expandDataId({"instrument": "DummyCam", "visit": visit_id})
                self.assertEqual(expanded["visit"], int(visit_id))
                self.assertIsInstance(expanded["visit"], int)

    def testDataIdRelationships(self):
        """Test that `Registry.expandDataId` raises an exception when the
        given keys are inconsistent.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        # Insert a few more dimension records for the next test.
        registry.insertDimensionData(
            "exposure",
            {"instrument": "Cam1", "id": 1, "obs_id": "one", "physical_filter": "Cam1-G"},
        )
        registry.insertDimensionData(
            "exposure",
            {"instrument": "Cam1", "id": 2, "obs_id": "two", "physical_filter": "Cam1-G"},
        )
        registry.insertDimensionData(
            "visit_system",
            {"instrument": "Cam1", "id": 0, "name": "one-to-one"},
        )
        registry.insertDimensionData(
            "visit",
            {"instrument": "Cam1", "id": 1, "name": "one", "physical_filter": "Cam1-G", "visit_system": 0},
        )
        registry.insertDimensionData(
            "visit_definition",
            {"instrument": "Cam1", "visit": 1, "exposure": 1, "visit_system": 0},
        )
        with self.assertRaises(InconsistentDataIdError):
            registry.expandDataId(
                {"instrument": "Cam1", "visit": 1, "exposure": 2},
            )

    def testDataset(self):
        """Basic tests for `Registry.insertDatasets`, `Registry.getDataset`,
        and `Registry.removeDatasets`.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        run = "tésτ"
        registry.registerRun(run)
        datasetType = registry.getDatasetType("bias")
        dataId = {"instrument": "Cam1", "detector": 2}
        (ref,) = registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
        outRef = registry.getDataset(ref.id)
        self.assertIsNotNone(ref.id)
        self.assertEqual(ref, outRef)
        with self.assertRaises(ConflictingDefinitionError):
            registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
        registry.removeDatasets([ref])
        self.assertIsNone(registry.findDataset(datasetType, dataId, collections=[run]))

    def testFindDataset(self):
        """Tests for `Registry.findDataset`."""
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        run = "tésτ"
        datasetType = registry.getDatasetType("bias")
        dataId = {"instrument": "Cam1", "detector": 4}
        registry.registerRun(run)
        (inputRef,) = registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
        outputRef = registry.findDataset(datasetType, dataId, collections=[run])
        self.assertEqual(outputRef, inputRef)
        # Check that retrieval with an invalid dataId raises
        with self.assertRaises(LookupError):
            dataId = {"instrument": "Cam1"}  # no detector
            registry.findDataset(datasetType, dataId, collections=run)
        # Check that different dataIds match to different datasets
        dataId1 = {"instrument": "Cam1", "detector": 1}
        (inputRef1,) = registry.insertDatasets(datasetType, dataIds=[dataId1], run=run)
        dataId2 = {"instrument": "Cam1", "detector": 2}
        (inputRef2,) = registry.insertDatasets(datasetType, dataIds=[dataId2], run=run)
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=run), inputRef1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=run), inputRef2)
        self.assertNotEqual(registry.findDataset(datasetType, dataId1, collections=run), inputRef2)
        self.assertNotEqual(registry.findDataset(datasetType, dataId2, collections=run), inputRef1)
        # Check that requesting a non-existing dataId returns None
        nonExistingDataId = {"instrument": "Cam1", "detector": 3}
        self.assertIsNone(registry.findDataset(datasetType, nonExistingDataId, collections=run))
        # Search more than one collection, in which two have the right
        # dataset type and another does not.
        registry.registerRun("empty")
        self.loadData(registry, "datasets-uuid.yaml")
        bias1 = registry.findDataset("bias", instrument="Cam1", detector=2, collections=["imported_g"])
        self.assertIsNotNone(bias1)
        bias2 = registry.findDataset("bias", instrument="Cam1", detector=2, collections=["imported_r"])
        self.assertIsNotNone(bias2)
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "imported_g", "imported_r"]
            ),
        )
        self.assertEqual(
            bias2,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "imported_r", "imported_g"]
            ),
        )
        # Search more than one collection, with one of them a CALIBRATION
        # collection.
        registry.registerCollection("Cam1/calib", CollectionType.CALIBRATION)
        timespan = Timespan(
            begin=astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai"),
            end=astropy.time.Time("2020-01-01T02:00:00", format="isot", scale="tai"),
        )
        registry.certify("Cam1/calib", [bias2], timespan=timespan)
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias",
                instrument="Cam1",
                detector=2,
                collections=["empty", "imported_g", "Cam1/calib"],
                timespan=timespan,
            ),
        )
        self.assertEqual(
            bias2,
            registry.findDataset(
                "bias",
                instrument="Cam1",
                detector=2,
                collections=["empty", "Cam1/calib", "imported_g"],
                timespan=timespan,
            ),
        )
        # If we try to search those same collections without a timespan, it
        # should still work, since the CALIBRATION collection is ignored.
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "imported_g", "Cam1/calib"]
            ),
        )
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "Cam1/calib", "imported_g"]
            ),
        )

    def testRemoveDatasetTypeSuccess(self):
        """Test that Registry.removeDatasetType works when there are no
        datasets of that type present.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        registry.removeDatasetType("flat")
        with self.assertRaises(MissingDatasetTypeError):
            registry.getDatasetType("flat")

    def testRemoveDatasetTypeFailure(self):
        """Test that Registry.removeDatasetType raises when there are
        datasets of that type present or if the dataset type is for a
        component.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        with self.assertRaises(OrphanedRecordError):
            registry.removeDatasetType("flat")
        with self.assertRaises(ValueError):
            registry.removeDatasetType(DatasetType.nameWithComponent("flat", "image"))

    def testImportDatasetsUUID(self):
        """Test for `Registry._importDatasets` with UUID dataset IDs."""
        if isinstance(self.datasetsManager, str):
            if not self.datasetsManager.endswith(".ByDimensionsDatasetRecordStorageManagerUUID"):
                self.skipTest(f"Unexpected dataset manager {self.datasetsManager}")
        elif isinstance(self.datasetsManager, dict):
            if not self.datasetsManager["cls"].endswith(".ByDimensionsDatasetRecordStorageManagerUUID"):
                self.skipTest(f"Unexpected dataset manager {self.datasetsManager['cls']}")

        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        for run in range(6):
            registry.registerRun(f"run{run}")
        datasetTypeBias = registry.getDatasetType("bias")
        datasetTypeFlat = registry.getDatasetType("flat")
        dataIdBias1 = {"instrument": "Cam1", "detector": 1}
        dataIdBias2 = {"instrument": "Cam1", "detector": 2}
        dataIdFlat1 = {"instrument": "Cam1", "detector": 1, "physical_filter": "Cam1-G", "band": "g"}

        dataset_id = uuid.uuid4()
        ref = DatasetRef(datasetTypeBias, dataIdBias1, id=dataset_id, run="run0")
        (ref1,) = registry._importDatasets([ref])
        # The UUID is used without change
        self.assertEqual(ref.id, ref1.id)

        # All the different failure modes
        refs = (
            # Importing the same DatasetRef with a different dataset ID is an
            # error
            DatasetRef(datasetTypeBias, dataIdBias1, id=uuid.uuid4(), run="run0"),
            # Same DatasetId but different DataId
            DatasetRef(datasetTypeBias, dataIdBias2, id=ref1.id, run="run0"),
            DatasetRef(datasetTypeFlat, dataIdFlat1, id=ref1.id, run="run0"),
            # Same DatasetRef and DatasetId but different run
            DatasetRef(datasetTypeBias, dataIdBias1, id=ref1.id, run="run1"),
        )
        for ref in refs:
            with self.assertRaises(ConflictingDefinitionError):
                registry._importDatasets([ref])

        # Test non-unique IDs; they can be re-imported multiple times.
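        # (As the assertions below show, both modes derive a deterministic
        # UUIDv5 from the dataset type and data ID; DATAID_TYPE_RUN
        # additionally folds in the run name, which is why only that mode can
        # be imported into a second run further down.)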

        for run, idGenMode in ((2, DatasetIdGenEnum.DATAID_TYPE), (4, DatasetIdGenEnum.DATAID_TYPE_RUN)):
            with self.subTest(idGenMode=idGenMode):
                # Use an integer dataset ID to force UUID calculation in
                # _importDatasets
                ref = DatasetRef(datasetTypeBias, dataIdBias1, id=0, run=f"run{run}")
                (ref1,) = registry._importDatasets([ref], idGenerationMode=idGenMode)
                self.assertIsInstance(ref1.id, uuid.UUID)
                self.assertEqual(ref1.id.version, 5)

                # Importing it again is OK
                (ref2,) = registry._importDatasets([ref1])
                self.assertEqual(ref2.id, ref1.id)

                # Cannot import to a different run with the same ID
                ref = DatasetRef(datasetTypeBias, dataIdBias1, id=ref1.id, run=f"run{run+1}")
                with self.assertRaises(ConflictingDefinitionError):
                    registry._importDatasets([ref])

                ref = DatasetRef(datasetTypeBias, dataIdBias1, id=0, run=f"run{run+1}")
                if idGenMode is DatasetIdGenEnum.DATAID_TYPE:
                    # Cannot import the same DATAID_TYPE ref into a new run
                    with self.assertRaises(ConflictingDefinitionError):
                        (ref2,) = registry._importDatasets([ref], idGenerationMode=idGenMode)
                else:
                    # A DATAID_TYPE_RUN ref can be imported into a new run
                    (ref2,) = registry._importDatasets([ref], idGenerationMode=idGenMode)

    def testDatasetTypeComponentQueries(self):
        """Test component options when querying for dataset types.

        All of the behavior here is deprecated, so many of these tests are
        currently wrapped in a context to check that we get a warning
        whenever a component dataset type is actually returned.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        # Test querying for dataset types with different inputs.
        # First query for all dataset types; components should only be
        # included when components=True.
        self.assertEqual({"bias", "flat"}, NamedValueSet(registry.queryDatasetTypes()).names)
        self.assertEqual({"bias", "flat"}, NamedValueSet(registry.queryDatasetTypes(components=False)).names)
        with self.assertWarns(FutureWarning):
            self.assertLess(
                {"bias", "flat", "bias.wcs", "flat.photoCalib"},
                NamedValueSet(registry.queryDatasetTypes(components=True)).names,
            )
        # Use a pattern that can match either parent or components. Again,
        # components are only returned if components=True.
        self.assertEqual({"bias"}, NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"))).names)
        self.assertEqual(
            {"bias"}, NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"), components=False)).names
        )
        with self.assertWarns(FutureWarning):
            self.assertLess(
                {"bias", "bias.wcs"},
                NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"), components=True)).names,
            )
        # This pattern matches only a component. In this case we also return
        # that component dataset type if components=None.
        with self.assertWarns(FutureWarning):
            self.assertEqual(
                {"bias.wcs"}, NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"))).names
            )
        self.assertEqual(
            set(),
            NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"), components=False)).names,
        )
        with self.assertWarns(FutureWarning):
            self.assertEqual(
                {"bias.wcs"},
                NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"), components=True)).names,
            )
        # Add a dataset type using a StorageClass that we'll then remove;
        # check that this does not affect our ability to query for dataset
        # types (though it will warn).
        tempStorageClass = StorageClass(
            name="TempStorageClass",
            components={
                "data1": registry.storageClasses.getStorageClass("StructuredDataDict"),
                "data2": registry.storageClasses.getStorageClass("StructuredDataDict"),
            },
        )
        registry.storageClasses.registerStorageClass(tempStorageClass)
        datasetType = DatasetType(
            "temporary",
            dimensions=["instrument"],
            storageClass=tempStorageClass,
            universe=registry.dimensions,
        )
        registry.registerDatasetType(datasetType)
        registry.storageClasses._unregisterStorageClass(tempStorageClass.name)
        datasetType._storageClass = None
        del tempStorageClass
        # Querying for all dataset types, including components, should
        # include at least all non-component dataset types (and I don't want
        # to enumerate all of the Exposure components for bias and flat
        # here).
        with self.assertWarns(FutureWarning):
            with self.assertLogs("lsst.daf.butler.registry", logging.WARN) as cm:
                everything = NamedValueSet(registry.queryDatasetTypes(components=True))
        self.assertIn("TempStorageClass", cm.output[0])
        self.assertLess({"bias", "flat", "temporary"}, everything.names)
        # It should not include "temporary.data1", because we tried to remove
        # the storage class that would tell it about that component. So if
        # the next line fails (i.e. "temporary.data1" _is_ in
        # everything.names), it means this part of the test isn't doing
        # anything, because the _unregister call above isn't simulating the
        # real-life case we want it to simulate, in which different versions
        # of daf_butler in entirely different Python processes interact with
        # the same repo.
        self.assertNotIn("temporary.data1", everything.names)
        # Query for dataset types that start with "temp". This should again
        # not include the component, and also not fail.
        with self.assertLogs("lsst.daf.butler.registry", logging.WARN) as cm:
            startsWithTemp = NamedValueSet(registry.queryDatasetTypes(re.compile("temp.*"), components=True))
        self.assertIn("TempStorageClass", cm.output[0])
        self.assertEqual({"temporary"}, startsWithTemp.names)
        # Querying with no components should not warn at all.
        with self.assertLogs("lsst.daf.butler.registries", logging.WARN) as cm:
            startsWithTemp = NamedValueSet(registry.queryDatasetTypes(re.compile("temp.*"), components=False))
            # Must issue a warning of our own to be captured.
            logging.getLogger("lsst.daf.butler.registries").warning("test message")
        self.assertEqual(len(cm.output), 1)
        self.assertIn("test message", cm.output[0])

    def testComponentLookups(self):
        """Test searching for component datasets via their parents.

        All of the behavior here is deprecated, so many of these tests are
        currently wrapped in a context to check that we get a warning
        whenever a component dataset is actually returned.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        # Test getting the child dataset type (which does still exist in the
        # Registry), and check for consistency with
        # DatasetRef.makeComponentRef.
        collection = "imported_g"
        parentType = registry.getDatasetType("bias")
        childType = registry.getDatasetType("bias.wcs")
        parentRefResolved = registry.findDataset(
            parentType, collections=collection, instrument="Cam1", detector=1
        )
        self.assertIsInstance(parentRefResolved, DatasetRef)
        self.assertEqual(childType, parentRefResolved.makeComponentRef("wcs").datasetType)
        # Search for a single dataset with findDataset.
        childRef1 = registry.findDataset("bias.wcs", collections=collection, dataId=parentRefResolved.dataId)
        self.assertEqual(childRef1, parentRefResolved.makeComponentRef("wcs"))
        # Search for detector data IDs constrained by component dataset
        # existence with queryDataIds.
        with self.assertWarns(FutureWarning):
            dataIds = registry.queryDataIds(
                ["detector"],
                datasets=["bias.wcs"],
                collections=collection,
            ).toSet()
        self.assertEqual(
            dataIds,
            DataCoordinateSet(
                {
                    DataCoordinate.standardize(instrument="Cam1", detector=d, graph=parentType.dimensions)
                    for d in (1, 2, 3)
                },
                parentType.dimensions,
            ),
        )
        # Search for multiple datasets of a single type with queryDatasets.
        with self.assertWarns(FutureWarning):
            childRefs2 = set(
                registry.queryDatasets(
                    "bias.wcs",
                    collections=collection,
                )
            )
        self.assertEqual(
            {ref.unresolved() for ref in childRefs2}, {DatasetRef(childType, dataId) for dataId in dataIds}
        )

    def testCollections(self):
        """Tests for registry methods that manage collections."""
        registry = self.makeRegistry()
        other_registry = self.makeRegistry(share_repo_with=registry)
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        run1 = "imported_g"
        run2 = "imported_r"
        # Test setting a collection docstring after it has been created.
        registry.setCollectionDocumentation(run1, "doc for run1")
        self.assertEqual(registry.getCollectionDocumentation(run1), "doc for run1")
        registry.setCollectionDocumentation(run1, None)
        self.assertIsNone(registry.getCollectionDocumentation(run1))
        datasetType = "bias"
        # Find some datasets via their run's collection.
        dataId1 = {"instrument": "Cam1", "detector": 1}
        ref1 = registry.findDataset(datasetType, dataId1, collections=run1)
        self.assertIsNotNone(ref1)
        dataId2 = {"instrument": "Cam1", "detector": 2}
        ref2 = registry.findDataset(datasetType, dataId2, collections=run1)
        self.assertIsNotNone(ref2)
        # Associate those into a new collection, then look for them there.
        tag1 = "tag1"
        registry.registerCollection(tag1, type=CollectionType.TAGGED, doc="doc for tag1")
        # Check that we can query for old and new collections by type.
        self.assertEqual(set(registry.queryCollections(collectionTypes=CollectionType.RUN)), {run1, run2})
        self.assertEqual(
            set(registry.queryCollections(collectionTypes={CollectionType.TAGGED, CollectionType.RUN})),
            {tag1, run1, run2},
        )
        self.assertEqual(registry.getCollectionDocumentation(tag1), "doc for tag1")
        registry.associate(tag1, [ref1, ref2])
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        # Disassociate one and verify that we can't find it there anymore...
        registry.disassociate(tag1, [ref1])
        self.assertIsNone(registry.findDataset(datasetType, dataId1, collections=tag1))
        # ...but we can still find ref2 in tag1, and ref1 in the run.
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=run1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        collections = set(registry.queryCollections())
        self.assertEqual(collections, {run1, run2, tag1})
        # Associate both refs into tag1 again; ref2 is already there, but
        # that should be a harmless no-op.
        registry.associate(tag1, [ref1, ref2])
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        # Get a different dataset (from a different run) that has the same
        # dataset type and data ID as ref2.
        ref2b = registry.findDataset(datasetType, dataId2, collections=run2)
        self.assertNotEqual(ref2, ref2b)
        # Attempting to associate that into tag1 should be an error.
        with self.assertRaises(ConflictingDefinitionError):
            registry.associate(tag1, [ref2b])
        # That error shouldn't have messed up what we had before.
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        # Attempt to associate the conflicting dataset again, this time with
        # a dataset that isn't in the collection and won't cause a conflict.
        # Should also fail without modifying anything.
        dataId3 = {"instrument": "Cam1", "detector": 3}
        ref3 = registry.findDataset(datasetType, dataId3, collections=run1)
        with self.assertRaises(ConflictingDefinitionError):
            registry.associate(tag1, [ref3, ref2b])
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        self.assertIsNone(registry.findDataset(datasetType, dataId3, collections=tag1))
        # Register a chained collection that searches [tag1, run2]
        chain1 = "chain1"
        registry.registerCollection(chain1, type=CollectionType.CHAINED)
        self.assertIs(registry.getCollectionType(chain1), CollectionType.CHAINED)
        # Chained collection exists, but has no collections in it.
        self.assertFalse(registry.getCollectionChain(chain1))
        # If we query for all collections, we should get the chained
        # collection only if we don't ask to flatten it (i.e. yield only its
        # children).
        self.assertEqual(set(registry.queryCollections(flattenChains=False)), {tag1, run1, run2, chain1})
        self.assertEqual(set(registry.queryCollections(flattenChains=True)), {tag1, run1, run2})
        # Attempt to set its child collections to something circular; that
        # should fail.
        with self.assertRaises(ValueError):
            registry.setCollectionChain(chain1, [tag1, chain1])
        # Add the child collections.
        registry.setCollectionChain(chain1, [tag1, run2])
        self.assertEqual(list(registry.getCollectionChain(chain1)), [tag1, run2])
        self.assertEqual(registry.getCollectionParentChains(tag1), {chain1})
        self.assertEqual(registry.getCollectionParentChains(run2), {chain1})
        # Refresh the other registry that points to the same repo, and make
        # sure it can see the things we've done (note that this does require
        # an explicit refresh(); that's the documented behavior, because
        # caching is ~impossible otherwise).
        if other_registry is not None:
            other_registry.refresh()
            self.assertEqual(list(other_registry.getCollectionChain(chain1)), [tag1, run2])
            self.assertEqual(other_registry.getCollectionParentChains(tag1), {chain1})
            self.assertEqual(other_registry.getCollectionParentChains(run2), {chain1})
        # Searching for dataId1 or dataId2 in the chain should return ref1
        # and ref2, because both are in tag1.
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=chain1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=chain1), ref2)
        # Now disassociate ref2 from tag1. The search (for bias) with
        # dataId2 in chain1 should then:
        # 1. not find it in tag1
        # 2. find a different dataset in run2
        registry.disassociate(tag1, [ref2])
        ref2b = registry.findDataset(datasetType, dataId2, collections=chain1)
        self.assertNotEqual(ref2b, ref2)
        self.assertEqual(ref2b, registry.findDataset(datasetType, dataId2, collections=run2))
        # Define a new chain so we can test recursive chains.
        chain2 = "chain2"
        registry.registerCollection(chain2, type=CollectionType.CHAINED)
        registry.setCollectionChain(chain2, [run2, chain1])
        self.assertEqual(registry.getCollectionParentChains(chain1), {chain2})
        self.assertEqual(registry.getCollectionParentChains(run2), {chain1, chain2})
        # Query for collections matching a regex.
        self.assertCountEqual(
            list(registry.queryCollections(re.compile("imported_."), flattenChains=False)),
            ["imported_r", "imported_g"],
        )
        # Query for collections matching a regex or an explicit str.
        self.assertCountEqual(
            list(registry.queryCollections([re.compile("imported_."), "chain1"], flattenChains=False)),
            ["imported_r", "imported_g", "chain1"],
        )
        # A search for bias with dataId1 should find it via tag1 in chain2,
        # recursing, because it is not in run2.
        self.assertIsNone(registry.findDataset(datasetType, dataId1, collections=run2))
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=chain2), ref1)
        # A search for bias with dataId2 should find it in run2 (ref2b).
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=chain2), ref2b)
        # Search for a flat that is in run2; it should also be found via
        # chain2, since run2 is at the front of that chain.
        dataId4 = {"instrument": "Cam1", "detector": 3, "physical_filter": "Cam1-R2"}
        ref4 = registry.findDataset("flat", dataId4, collections=run2)
        self.assertIsNotNone(ref4)
        self.assertEqual(ref4, registry.findDataset("flat", dataId4, collections=chain2))
        # Deleting a collection that's part of a CHAINED collection is not
        # allowed, and is exception-safe.
        with self.assertRaises(Exception):
            registry.removeCollection(run2)
        self.assertEqual(registry.getCollectionType(run2), CollectionType.RUN)
        with self.assertRaises(Exception):
            registry.removeCollection(chain1)
        self.assertEqual(registry.getCollectionType(chain1), CollectionType.CHAINED)
        # Actually remove chain2, and test that it's gone by asking for its
        # type.
        registry.removeCollection(chain2)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(chain2)
        # Actually remove run2 and chain1, which should work now.
        registry.removeCollection(chain1)
        registry.removeCollection(run2)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(run2)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(chain1)
        # Remove tag1 as well, just to test that we can remove TAGGED
        # collections.
        registry.removeCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(tag1)

    def testCollectionChainFlatten(self):
        """Test that Registry.setCollectionChain obeys its 'flatten' option."""
        registry = self.makeRegistry()
        registry.registerCollection("inner", CollectionType.CHAINED)
        registry.registerCollection("innermost", CollectionType.RUN)
        registry.setCollectionChain("inner", ["innermost"])
        registry.registerCollection("outer", CollectionType.CHAINED)
        registry.setCollectionChain("outer", ["inner"], flatten=False)
        self.assertEqual(list(registry.getCollectionChain("outer")), ["inner"])
        registry.setCollectionChain("outer", ["inner"], flatten=True)
        self.assertEqual(list(registry.getCollectionChain("outer")), ["innermost"])

    def testBasicTransaction(self):
        """Test that all operations within a single transaction block are
        rolled back if an exception propagates out of the block.
        """
        registry = self.makeRegistry()
        storageClass = StorageClass("testDatasetType")
        registry.storageClasses.registerStorageClass(storageClass)
        with registry.transaction():
            registry.insertDimensionData("instrument", {"name": "Cam1", "class_name": "A"})
        with self.assertRaises(ValueError):
            with registry.transaction():
                registry.insertDimensionData("instrument", {"name": "Cam2"})
                raise ValueError("Oops, something went wrong")
        # Cam1 should exist
        self.assertEqual(registry.expandDataId(instrument="Cam1").records["instrument"].class_name, "A")
        # But Cam2 and Cam3 should both not exist
        with self.assertRaises(DataIdValueError):
            registry.expandDataId(instrument="Cam2")
        with self.assertRaises(DataIdValueError):
            registry.expandDataId(instrument="Cam3")

    def testNestedTransaction(self):
        """Test that operations within a transaction block are not rolled
        back if an exception propagates out of an inner transaction block and
        is then caught.
        """

        registry = self.makeRegistry()
        dimension = registry.dimensions["instrument"]
        dataId1 = {"instrument": "DummyCam"}
        dataId2 = {"instrument": "DummyCam2"}
        checkpointReached = False
        with registry.transaction():
            # This should be added and (ultimately) committed.
            registry.insertDimensionData(dimension, dataId1)
            with self.assertRaises(sqlalchemy.exc.IntegrityError):
                with registry.transaction(savepoint=True):
                    # This does not conflict, and should succeed (but not
                    # be committed).
                    registry.insertDimensionData(dimension, dataId2)
                    checkpointReached = True
                    # This should conflict and raise, triggering a rollback
                    # of the previous insertion within the same transaction
                    # context, but not the original insertion in the outer
                    # block.
                    registry.insertDimensionData(dimension, dataId1)
        self.assertTrue(checkpointReached)
        self.assertIsNotNone(registry.expandDataId(dataId1, graph=dimension.graph))
        with self.assertRaises(DataIdValueError):
            registry.expandDataId(dataId2, graph=dimension.graph)

    def testInstrumentDimensions(self):
        """Test queries involving only instrument dimensions, with no joins
        to skymap.
        """
        registry = self.makeRegistry()

        # Need a bunch of dimensions and datasets for this test.
        registry.insertDimensionData(
            "instrument", dict(name="DummyCam", visit_max=25, exposure_max=300, detector_max=6)
        )
        registry.insertDimensionData(
            "physical_filter",
            dict(instrument="DummyCam", name="dummy_r", band="r"),
            dict(instrument="DummyCam", name="dummy_i", band="i"),
        )
        registry.insertDimensionData(
            "detector", *[dict(instrument="DummyCam", id=i, full_name=str(i)) for i in range(1, 6)]
        )
        registry.insertDimensionData(
            "visit_system",
            dict(instrument="DummyCam", id=1, name="default"),
        )
        registry.insertDimensionData(
            "visit",
            dict(instrument="DummyCam", id=10, name="ten", physical_filter="dummy_i", visit_system=1),
            dict(instrument="DummyCam", id=11, name="eleven", physical_filter="dummy_r", visit_system=1),
            dict(instrument="DummyCam", id=20, name="twelve", physical_filter="dummy_r", visit_system=1),
        )
        for i in range(1, 6):
            registry.insertDimensionData(
                "visit_detector_region",
                dict(instrument="DummyCam", visit=10, detector=i),
                dict(instrument="DummyCam", visit=11, detector=i),
                dict(instrument="DummyCam", visit=20, detector=i),
            )
        registry.insertDimensionData(
            "exposure",
            dict(instrument="DummyCam", id=100, obs_id="100", physical_filter="dummy_i"),
            dict(instrument="DummyCam", id=101, obs_id="101", physical_filter="dummy_i"),
            dict(instrument="DummyCam", id=110, obs_id="110", physical_filter="dummy_r"),
            dict(instrument="DummyCam", id=111, obs_id="111", physical_filter="dummy_r"),
            dict(instrument="DummyCam", id=200, obs_id="200", physical_filter="dummy_r"),
            dict(instrument="DummyCam", id=201, obs_id="201", physical_filter="dummy_r"),
        )
        registry.insertDimensionData(
            "visit_definition",
            dict(instrument="DummyCam", exposure=100, visit_system=1, visit=10),
            dict(instrument="DummyCam", exposure=101, visit_system=1, visit=10),
            dict(instrument="DummyCam", exposure=110, visit_system=1, visit=11),
            dict(instrument="DummyCam", exposure=111, visit_system=1, visit=11),
            dict(instrument="DummyCam", exposure=200, visit_system=1, visit=20),
            dict(instrument="DummyCam", exposure=201, visit_system=1, visit=20),
        )
        # dataset types
        run1 = "test1_r"
        run2 = "test2_r"
        tagged2 = "test2_t"
        registry.registerRun(run1)
        registry.registerRun(run2)
        registry.registerCollection(tagged2)
        storageClass = StorageClass("testDataset")
        registry.storageClasses.registerStorageClass(storageClass)
        rawType = DatasetType(
            name="RAW",
            dimensions=registry.dimensions.extract(("instrument", "exposure", "detector")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(rawType)
        calexpType = DatasetType(
            name="CALEXP",
            dimensions=registry.dimensions.extract(("instrument", "visit", "detector")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(calexpType)

        # Add pre-existing datasets.
        for exposure in (100, 101, 110, 111):
            for detector in (1, 2, 3):
                # note that only 3 of 5 detectors have datasets
                dataId = dict(instrument="DummyCam", exposure=exposure, detector=detector)
                (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run1)
                # Exposures 100 and 101 appear in both run1 and tagged2:
                # 100 has different datasets in the different collections,
                # 101 has the same dataset in both collections.
                if exposure == 100:
                    (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run2)
                if exposure in (100, 101):
                    registry.associate(tagged2, [ref])
        # Add pre-existing datasets to tagged2.
        for exposure in (200, 201):
            for detector in (3, 4, 5):
                # note that only 3 of 5 detectors have datasets
                dataId = dict(instrument="DummyCam", exposure=exposure, detector=detector)
                (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run2)
                registry.associate(tagged2, [ref])

        dimensions = DimensionGraph(
            registry.dimensions, dimensions=(rawType.dimensions.required | calexpType.dimensions.required)
        )
        # Test that a single dim string works as well as a list of str.
        rows = registry.queryDataIds("visit", datasets=rawType, collections=run1).expanded().toSet()
        rowsI = registry.queryDataIds(["visit"], datasets=rawType, collections=run1).expanded().toSet()
        self.assertEqual(rows, rowsI)
        # with empty expression
        rows = registry.queryDataIds(dimensions, datasets=rawType, collections=run1).expanded().toSet()
        self.assertEqual(len(rows), 4 * 3)  # 4 exposures times 3 detectors
        for dataId in rows:
            self.assertCountEqual(dataId.keys(), ("instrument", "detector", "exposure", "visit"))
            packer1 = registry.dimensions.makePacker("visit_detector", dataId)
            packer2 = registry.dimensions.makePacker("exposure_detector", dataId)
            self.assertEqual(
                packer1.unpack(packer1.pack(dataId)),
                DataCoordinate.standardize(dataId, graph=packer1.dimensions),
            )
            self.assertEqual(
                packer2.unpack(packer2.pack(dataId)),
                DataCoordinate.standardize(dataId, graph=packer2.dimensions),
            )
            self.assertNotEqual(packer1.pack(dataId), packer2.pack(dataId))
        self.assertCountEqual(set(dataId["exposure"] for dataId in rows), (100, 101, 110, 111))
        self.assertCountEqual(set(dataId["visit"] for dataId in rows), (10, 11))
        self.assertCountEqual(set(dataId["detector"] for dataId in rows), (1, 2, 3))

        # second collection
        rows = registry.queryDataIds(dimensions, datasets=rawType, collections=tagged2).toSet()
        self.assertEqual(len(rows), 4 * 3)  # 4 exposures times 3 detectors
        for dataId in rows:
            self.assertCountEqual(dataId.keys(), ("instrument", "detector", "exposure", "visit"))
        self.assertCountEqual(set(dataId["exposure"] for dataId in rows), (100, 101, 200, 201))
        self.assertCountEqual(set(dataId["visit"] for dataId in rows), (10, 20))
        self.assertCountEqual(set(dataId["detector"] for dataId in rows), (1, 2, 3, 4, 5))

        # with two input collections
        rows = registry.queryDataIds(dimensions, datasets=rawType, collections=[run1, tagged2]).toSet()
        self.assertEqual(len(set(rows)), 6 * 3)  # 6 exposures times 3 detectors; set needed to de-dupe
        for dataId in rows:
            self.assertCountEqual(dataId.keys(), ("instrument", "detector", "exposure", "visit"))
        self.assertCountEqual(set(dataId["exposure"] for dataId in rows), (100, 101, 110, 111, 200, 201))
        self.assertCountEqual(set(dataId["visit"] for dataId in rows), (10, 11, 20))
        self.assertCountEqual(set(dataId["detector"] for dataId in rows), (1, 2, 3, 4, 5))

        # limit to a single visit
        rows = registry.queryDataIds(
            dimensions, datasets=rawType, collections=run1, where="visit = 10", instrument="DummyCam"
        ).toSet()
        self.assertEqual(len(rows), 2 * 3)  # 2 exposures times 3 detectors
        self.assertCountEqual(set(dataId["exposure"] for dataId in rows), (100, 101))
        self.assertCountEqual(set(dataId["visit"] for dataId in rows), (10,))
        self.assertCountEqual(set(dataId["detector"] for dataId in rows), (1, 2, 3))

        # A more limiting expression, using link names instead of
        # Table.column.
        rows = registry.queryDataIds(
            dimensions,
            datasets=rawType,
            collections=run1,
            where="visit = 10 and detector > 1 and 'DummyCam'=instrument",
        ).toSet()
        self.assertEqual(len(rows), 2 * 2)  # 2 exposures times 2 detectors
        self.assertCountEqual(set(dataId["exposure"] for dataId in rows), (100, 101))
        self.assertCountEqual(set(dataId["visit"] for dataId in rows), (10,))
        self.assertCountEqual(set(dataId["detector"] for dataId in rows), (2, 3))

        # queryDataIds with only one of `datasets` and `collections` is an
        # error.
        with self.assertRaises(CollectionError):
            registry.queryDataIds(dimensions, datasets=rawType)
        with self.assertRaises(ArgumentError):
            registry.queryDataIds(dimensions, collections=run1)

        # An expression that excludes everything.
        rows = registry.queryDataIds(
            dimensions, datasets=rawType, collections=run1, where="visit > 1000", instrument="DummyCam"
        ).toSet()
        self.assertEqual(len(rows), 0)

        # Selecting by physical_filter: this is not in the dimensions, but it
        # is a part of the full expression so it should work too.
        rows = registry.queryDataIds(
            dimensions,
            datasets=rawType,
            collections=run1,
            where="physical_filter = 'dummy_r'",
            instrument="DummyCam",
        ).toSet()
        self.assertEqual(len(rows), 2 * 3)  # 2 exposures times 3 detectors
        self.assertCountEqual(set(dataId["exposure"] for dataId in rows), (110, 111))
        self.assertCountEqual(set(dataId["visit"] for dataId in rows), (11,))
        self.assertCountEqual(set(dataId["detector"] for dataId in rows), (1, 2, 3))

1144 def testSkyMapDimensions(self): 

1145 """Tests involving only skymap dimensions, no joins to instrument.""" 

1146 registry = self.makeRegistry() 

1147 

1148 # need a bunch of dimensions and datasets for test, we want 

1149 # "band" in the test so also have to add physical_filter 

1150 # dimensions 

1151 registry.insertDimensionData("instrument", dict(instrument="DummyCam")) 

1152 registry.insertDimensionData( 

1153 "physical_filter", 

1154 dict(instrument="DummyCam", name="dummy_r", band="r"), 

1155 dict(instrument="DummyCam", name="dummy_i", band="i"), 

1156 ) 

1157 registry.insertDimensionData("skymap", dict(name="DummyMap", hash="sha!".encode("utf8"))) 

1158 for tract in range(10): 

1159 registry.insertDimensionData("tract", dict(skymap="DummyMap", id=tract)) 

1160 registry.insertDimensionData( 

1161 "patch", 

1162 *[dict(skymap="DummyMap", tract=tract, id=patch, cell_x=0, cell_y=0) for patch in range(10)], 

1163 ) 

1164 

1165 # dataset types 

1166 run = "tésτ" 

1167 registry.registerRun(run) 

1168 storageClass = StorageClass("testDataset") 

1169 registry.storageClasses.registerStorageClass(storageClass) 

1170 calexpType = DatasetType( 

1171 name="deepCoadd_calexp", 

1172 dimensions=registry.dimensions.extract(("skymap", "tract", "patch", "band")), 

1173 storageClass=storageClass, 

1174 ) 

1175 registry.registerDatasetType(calexpType) 

1176 mergeType = DatasetType( 

1177 name="deepCoadd_mergeDet", 

1178 dimensions=registry.dimensions.extract(("skymap", "tract", "patch")), 

1179 storageClass=storageClass, 

1180 ) 

1181 registry.registerDatasetType(mergeType) 

1182 measType = DatasetType( 

1183 name="deepCoadd_meas", 

1184 dimensions=registry.dimensions.extract(("skymap", "tract", "patch", "band")), 

1185 storageClass=storageClass, 

1186 ) 

1187 registry.registerDatasetType(measType) 

1188 

1189 dimensions = DimensionGraph( 

1190 registry.dimensions, 

1191 dimensions=( 

1192 calexpType.dimensions.required | mergeType.dimensions.required | measType.dimensions.required 

1193 ), 

1194 ) 

1195 

1196 # add pre-existing datasets 

1197 for tract in (1, 3, 5): 

1198 for patch in (2, 4, 6, 7): 

1199 dataId = dict(skymap="DummyMap", tract=tract, patch=patch) 

1200 registry.insertDatasets(mergeType, dataIds=[dataId], run=run) 

1201 for aFilter in ("i", "r"): 

1202 dataId = dict(skymap="DummyMap", tract=tract, patch=patch, band=aFilter) 

1203 registry.insertDatasets(calexpType, dataIds=[dataId], run=run) 

1204 

1205 # with empty expression 

1206 rows = registry.queryDataIds(dimensions, datasets=[calexpType, mergeType], collections=run).toSet() 

1207 self.assertEqual(len(rows), 3 * 4 * 2) # 4 tracts x 4 patches x 2 filters 

1208 for dataId in rows: 

1209 self.assertCountEqual(dataId.keys(), ("skymap", "tract", "patch", "band")) 

1210 self.assertCountEqual(set(dataId["tract"] for dataId in rows), (1, 3, 5)) 

1211 self.assertCountEqual(set(dataId["patch"] for dataId in rows), (2, 4, 6, 7)) 

1212 self.assertCountEqual(set(dataId["band"] for dataId in rows), ("i", "r")) 

1213 

1214 # limit to 2 tracts and 2 patches 

1215 rows = registry.queryDataIds( 

1216 dimensions, 

1217 datasets=[calexpType, mergeType], 

1218 collections=run, 

1219 where="tract IN (1, 5) AND patch IN (2, 7)", 

1220 skymap="DummyMap", 

1221 ).toSet() 

1222 self.assertEqual(len(rows), 2 * 2 * 2) # 2 tracts x 2 patches x 2 filters 

1223 self.assertCountEqual(set(dataId["tract"] for dataId in rows), (1, 5)) 

1224 self.assertCountEqual(set(dataId["patch"] for dataId in rows), (2, 7)) 

1225 self.assertCountEqual(set(dataId["band"] for dataId in rows), ("i", "r")) 

1226 

1227 # limit to single filter 

1228 rows = registry.queryDataIds( 

1229 dimensions, datasets=[calexpType, mergeType], collections=run, where="band = 'i'" 

1230 ).toSet() 

1231 self.assertEqual(len(rows), 3 * 4 * 1) # 3 tracts x 4 patches x 1 filter 

1232 self.assertCountEqual(set(dataId["tract"] for dataId in rows), (1, 3, 5)) 

1233 self.assertCountEqual(set(dataId["patch"] for dataId in rows), (2, 4, 6, 7)) 

1234 self.assertCountEqual(set(dataId["band"] for dataId in rows), ("i",)) 

1235 

1236 # Specifying a non-existent skymap raises an exception. 

1237 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"): 

1238 rows = registry.queryDataIds( 

1239 dimensions, datasets=[calexpType, mergeType], collections=run, where="skymap = 'Mars'" 

1240 ).toSet() 

1241 
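    # Hedged aside (not one of the tests): a compact sketch of the `where`
    # grammar exercised above -- equality, IN lists, and eager governor
    # validation. It assumes a registry populated exactly as in
    # testSkyMapDimensions, with the same local names passed in.
    def _sketch_where_grammar(self, registry, dimensions, calexpType, mergeType, run) -> None:
        rows = registry.queryDataIds(
            dimensions,
            datasets=[calexpType, mergeType],
            collections=run,
            where="tract IN (1, 3) AND band = 'r'",
            skymap="DummyMap",
        ).toSet()
        self.assertEqual(len(rows), 2 * 4 * 1)  # 2 tracts x 4 patches x 1 band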

1242 def testSpatialJoin(self): 

1243 """Test queries that involve spatial overlap joins.""" 

1244 registry = self.makeRegistry() 

1245 self.loadData(registry, "hsc-rc2-subset.yaml") 

1246 

1247 # Dictionary of spatial DatabaseDimensionElements, keyed by the name of 

1248 # the TopologicalFamily they belong to. We'll relate all elements in 

1249 # each family to all of the elements in each other family. 

1250 families = defaultdict(set) 

1251 # Dictionary of {element.name: {dataId: region}}. 

1252 regions = {} 

1253 for element in registry.dimensions.getDatabaseElements(): 

1254 if element.spatial is not None: 

1255 families[element.spatial.name].add(element) 

1256 regions[element.name] = { 

1257 record.dataId: record.region for record in registry.queryDimensionRecords(element) 

1258 } 

1259 

1260 # If this check fails, it's not necessarily a problem - it may just be 

1261 # a reasonable change to the default dimension definitions - but the 

1262 # test below depends on there being more than one family to do anything 

1263 # useful. 

1264 self.assertEqual(len(families), 2) 

1265 

1266 # Overlap DatabaseDimensionElements with each other. 

1267 for family1, family2 in itertools.combinations(families, 2): 

1268 for element1, element2 in itertools.product(families[family1], families[family2]): 

1269 graph = DimensionGraph.union(element1.graph, element2.graph) 

1270 # Construct expected set of overlapping data IDs via a 

1271 # brute-force comparison of the regions we've already fetched. 

1272 expected = { 

1273 DataCoordinate.standardize({**dataId1.byName(), **dataId2.byName()}, graph=graph) 

1274 for (dataId1, region1), (dataId2, region2) in itertools.product( 

1275 regions[element1.name].items(), regions[element2.name].items() 

1276 ) 

1277 if not region1.isDisjointFrom(region2) 

1278 } 

1279 self.assertGreater(len(expected), 2, msg="Test that we aren't just comparing empty sets.") 

1280 queried = set(registry.queryDataIds(graph)) 

1281 self.assertEqual(expected, queried) 

1282 
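        # Aside (illustrative sketch, independent of the registry data):
        # the brute-force expectation above leans on sphgeom topology.
        # An htm7 trixel contains each of its four htm8 children (child
        # indexes are 4*parent .. 4*parent + 3), so it is never disjoint
        # from any of them.
        htm7 = lsst.sphgeom.HtmPixelization(7)
        htm8 = lsst.sphgeom.HtmPixelization(8)
        first_index, _ = next(iter(htm7.universe()))
        parent = htm7.triangle(first_index)
        for child_index in range(4 * first_index, 4 * first_index + 4):
            child = htm8.triangle(child_index)
            self.assertTrue(parent.contains(child))
            self.assertFalse(parent.isDisjointFrom(child))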

1283 # Overlap each DatabaseDimensionElement with the commonSkyPix system. 

1284 commonSkyPix = registry.dimensions.commonSkyPix 

1285 for elementName, element_regions in regions.items(): 

1286 graph = DimensionGraph.union(registry.dimensions[elementName].graph, commonSkyPix.graph) 

1287 expected = set() 

1288 for dataId, region in element_regions.items(): 

1289 for begin, end in commonSkyPix.pixelization.envelope(region): 

1290 expected.update( 

1291 DataCoordinate.standardize({commonSkyPix.name: index, **dataId.byName()}, graph=graph) 

1292 for index in range(begin, end) 

1293 ) 

1294 self.assertGreater(len(expected), 2, msg="Test that we aren't just comparing empty sets.") 

1295 queried = set(registry.queryDataIds(graph)) 

1296 self.assertEqual(expected, queried) 

1297 

1298 def testAbstractQuery(self): 

1299 """Test that we can run a query that just lists the known 

1300 bands. This is tricky because band is 

1301 backed by a query against physical_filter. 

1302 """ 

1303 registry = self.makeRegistry() 

1304 registry.insertDimensionData("instrument", dict(name="DummyCam")) 

1305 registry.insertDimensionData( 

1306 "physical_filter", 

1307 dict(instrument="DummyCam", name="dummy_i", band="i"), 

1308 dict(instrument="DummyCam", name="dummy_i2", band="i"), 

1309 dict(instrument="DummyCam", name="dummy_r", band="r"), 

1310 ) 

1311 rows = registry.queryDataIds(["band"]).toSet() 

1312 self.assertCountEqual( 

1313 rows, 

1314 [ 

1315 DataCoordinate.standardize(band="i", universe=registry.dimensions), 

1316 DataCoordinate.standardize(band="r", universe=registry.dimensions), 

1317 ], 

1318 ) 

1319 
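    # Hedged aside: a minimal sketch of why the query above is "tricky".
    # "band" has no table of its own here; its values are projected out of
    # physical_filter rows, so duplicates collapse. Assumes a registry
    # populated as in testAbstractQuery.
    def _sketch_band_projection(self, registry) -> None:
        bands = {dataId["band"] for dataId in registry.queryDataIds(["band"])}
        self.assertEqual(bands, {"i", "r"})  # three filters, two bands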

1320 def testAttributeManager(self): 

1321 """Test basic functionality of attribute manager.""" 

1322 # Number of attribute records in a fresh database: 6 managers with 

1323 # 2 version records per manager, plus the config record for dimensions 

1324 VERSION_COUNT = 6 * 2 + 1 

1325 

1326 registry = self.makeRegistry() 

1327 attributes = registry._managers.attributes 

1328 

1329 # check what get() returns for non-existing key 

1330 self.assertIsNone(attributes.get("attr")) 

1331 self.assertEqual(attributes.get("attr", ""), "") 

1332 self.assertEqual(attributes.get("attr", "Value"), "Value") 

1333 self.assertEqual(len(list(attributes.items())), VERSION_COUNT) 

1334 

1335 # cannot store empty key or value 

1336 with self.assertRaises(ValueError): 

1337 attributes.set("", "value") 

1338 with self.assertRaises(ValueError): 

1339 attributes.set("attr", "") 

1340 

1341 # set value of non-existing key 

1342 attributes.set("attr", "value") 

1343 self.assertEqual(len(list(attributes.items())), VERSION_COUNT + 1) 

1344 self.assertEqual(attributes.get("attr"), "value") 

1345 

1346 # update value of existing key 

1347 with self.assertRaises(ButlerAttributeExistsError): 

1348 attributes.set("attr", "value2") 

1349 

1350 attributes.set("attr", "value2", force=True) 

1351 self.assertEqual(len(list(attributes.items())), VERSION_COUNT + 1) 

1352 self.assertEqual(attributes.get("attr"), "value2") 

1353 

1354 # delete existing key 

1355 self.assertTrue(attributes.delete("attr")) 

1356 self.assertEqual(len(list(attributes.items())), VERSION_COUNT) 

1357 

1358 # delete non-existing key 

1359 self.assertFalse(attributes.delete("non-attr")) 

1360 

1361 # store a bunch of keys and get the list back 

1362 data = [ 

1363 ("version.core", "1.2.3"), 

1364 ("version.dimensions", "3.2.1"), 

1365 ("config.managers.opaque", "ByNameOpaqueTableStorageManager"), 

1366 ] 

1367 for key, value in data: 

1368 attributes.set(key, value) 

1369 items = dict(attributes.items()) 

1370 for key, value in data: 

1371 self.assertEqual(items[key], value) 

1372 

1373 def testQueryDatasetsDeduplication(self): 

1374 """Test that the findFirst option to queryDatasets selects datasets 

1375 from collections in the order given. 

1376 """ 

1377 registry = self.makeRegistry() 

1378 self.loadData(registry, "base.yaml") 

1379 self.loadData(registry, "datasets.yaml") 

1380 self.assertCountEqual( 

1381 list(registry.queryDatasets("bias", collections=["imported_g", "imported_r"])), 

1382 [ 

1383 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"), 

1384 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g"), 

1385 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g"), 

1386 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r"), 

1387 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r"), 

1388 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"), 

1389 ], 

1390 ) 

1391 self.assertCountEqual( 

1392 list(registry.queryDatasets("bias", collections=["imported_g", "imported_r"], findFirst=True)), 

1393 [ 

1394 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"), 

1395 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g"), 

1396 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g"), 

1397 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"), 

1398 ], 

1399 ) 

1400 self.assertCountEqual( 

1401 list(registry.queryDatasets("bias", collections=["imported_r", "imported_g"], findFirst=True)), 

1402 [ 

1403 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"), 

1404 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r"), 

1405 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r"), 

1406 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"), 

1407 ], 

1408 ) 

1409 
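    # Conceptual sketch (pure Python, not the registry implementation) of
    # the findFirst=True contract tested above: for each data ID, keep the
    # dataset from the first collection, in the order given, that has one.
    @staticmethod
    def _find_first_model(refs_by_collection: dict, collections: list) -> dict:
        resolved: dict = {}
        for collection in collections:
            for data_id, ref in refs_by_collection.get(collection, {}).items():
                # setdefault keeps the first hit and ignores later ones
                resolved.setdefault(data_id, ref)
        return resolved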

1410 def testQueryResults(self): 

1411 """Test querying for data IDs and then manipulating the QueryResults 

1412 object returned to perform other queries. 

1413 """ 

1414 registry = self.makeRegistry() 

1415 self.loadData(registry, "base.yaml") 

1416 self.loadData(registry, "datasets.yaml") 

1417 bias = registry.getDatasetType("bias") 

1418 flat = registry.getDatasetType("flat") 

1419 # Obtain expected results from methods other than those we're testing 

1420 # here. That includes: 

1421 # - the dimensions of the data IDs we want to query: 

1422 expectedGraph = DimensionGraph(registry.dimensions, names=["detector", "physical_filter"]) 

1423 # - the dimensions of some other data IDs we'll extract from that: 

1424 expectedSubsetGraph = DimensionGraph(registry.dimensions, names=["detector"]) 

1425 # - the data IDs we expect to obtain from the first queries: 

1426 expectedDataIds = DataCoordinateSet( 

1427 { 

1428 DataCoordinate.standardize( 

1429 instrument="Cam1", detector=d, physical_filter=p, universe=registry.dimensions 

1430 ) 

1431 for d, p in itertools.product({1, 2, 3}, {"Cam1-G", "Cam1-R1", "Cam1-R2"}) 

1432 }, 

1433 graph=expectedGraph, 

1434 hasFull=False, 

1435 hasRecords=False, 

1436 ) 

1437 # - the flat datasets we expect to find from those data IDs, in just 

1438 # one collection (so deduplication is irrelevant): 

1439 expectedFlats = [ 

1440 registry.findDataset( 

1441 flat, instrument="Cam1", detector=1, physical_filter="Cam1-R1", collections="imported_r" 

1442 ), 

1443 registry.findDataset( 

1444 flat, instrument="Cam1", detector=2, physical_filter="Cam1-R1", collections="imported_r" 

1445 ), 

1446 registry.findDataset( 

1447 flat, instrument="Cam1", detector=3, physical_filter="Cam1-R2", collections="imported_r" 

1448 ), 

1449 ] 

1450 # - the data IDs we expect to extract from that: 

1451 expectedSubsetDataIds = expectedDataIds.subset(expectedSubsetGraph) 

1452 # - the bias datasets we expect to find from those data IDs, after we 

1453 # subset-out the physical_filter dimension, both with duplicates: 

1454 expectedAllBiases = [ 

1455 registry.findDataset(bias, instrument="Cam1", detector=1, collections="imported_g"), 

1456 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_g"), 

1457 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_g"), 

1458 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_r"), 

1459 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_r"), 

1460 ] 

1461 # - ...and without duplicates: 

1462 expectedDeduplicatedBiases = [ 

1463 registry.findDataset(bias, instrument="Cam1", detector=1, collections="imported_g"), 

1464 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_r"), 

1465 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_r"), 

1466 ] 

1467 # Test against those expected results, using a "lazy" query for the 

1468 # data IDs (which re-executes that query each time we use it to do 

1469 # something new). 

1470 dataIds = registry.queryDataIds( 

1471 ["detector", "physical_filter"], 

1472 where="detector.purpose = 'SCIENCE'", # this rejects detector=4 

1473 instrument="Cam1", 

1474 ) 

1475 self.assertEqual(dataIds.graph, expectedGraph) 

1476 self.assertEqual(dataIds.toSet(), expectedDataIds) 

1477 self.assertCountEqual( 

1478 list( 

1479 dataIds.findDatasets( 

1480 flat, 

1481 collections=["imported_r"], 

1482 ) 

1483 ), 

1484 expectedFlats, 

1485 ) 

1486 subsetDataIds = dataIds.subset(expectedSubsetGraph, unique=True) 

1487 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph) 

1488 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds) 

1489 self.assertCountEqual( 

1490 list(subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=False)), 

1491 expectedAllBiases, 

1492 ) 

1493 self.assertCountEqual( 

1494 list(subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)), 

1495 expectedDeduplicatedBiases, 

1496 ) 

1497 

1498 # findDatasets raises when the dataset type's dimensions ("flat" needs physical_filter) are not a subset of the query's. 

1499 with self.assertRaises(ValueError): 

1500 subsetDataIds.findDatasets("flat", collections=["imported_r", "imported_g"], findFirst=True) 

1501 

1502 # Use a component dataset type. 

1503 self.assertCountEqual( 

1504 [ 

1505 ref.makeComponentRef("image") 

1506 for ref in subsetDataIds.findDatasets( 

1507 bias, 

1508 collections=["imported_r", "imported_g"], 

1509 findFirst=False, 

1510 ) 

1511 ], 

1512 [ref.makeComponentRef("image") for ref in expectedAllBiases], 

1513 ) 

1514 

1515 # Use a named dataset type that does not exist and a dataset type 

1516 # object that does not exist. 

1517 unknown_type = DatasetType("not_known", dimensions=bias.dimensions, storageClass="Exposure") 

1518 

1519 # Test both string name and dataset type object. 

1520 test_type: Union[str, DatasetType] 

1521 for test_type, test_type_name in ( 

1522 (unknown_type, unknown_type.name), 

1523 (unknown_type.name, unknown_type.name), 

1524 ): 

1525 with self.assertRaisesRegex(DatasetTypeError, expected_regex=test_type_name): 

1526 list( 

1527 subsetDataIds.findDatasets( 

1528 test_type, collections=["imported_r", "imported_g"], findFirst=True 

1529 ) 

1530 ) 

1531 

1532 # Materialize the bias dataset queries (only) by putting the results 

1533 # into temporary tables, then repeat those tests. 

1534 with subsetDataIds.findDatasets( 

1535 bias, collections=["imported_r", "imported_g"], findFirst=False 

1536 ).materialize() as biases: 

1537 self.assertCountEqual(list(biases), expectedAllBiases) 

1538 with subsetDataIds.findDatasets( 

1539 bias, collections=["imported_r", "imported_g"], findFirst=True 

1540 ).materialize() as biases: 

1541 self.assertCountEqual(list(biases), expectedDeduplicatedBiases) 

1542 # Materialize the data ID subset query, but not the dataset queries. 

1543 with subsetDataIds.materialize() as subsetDataIds: 

1544 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph) 

1545 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds) 

1546 self.assertCountEqual( 

1547 list( 

1548 subsetDataIds.findDatasets( 

1549 bias, collections=["imported_r", "imported_g"], findFirst=False 

1550 ) 

1551 ), 

1552 expectedAllBiases, 

1553 ) 

1554 self.assertCountEqual( 

1555 list( 

1556 subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True) 

1557 ), 

1558 expectedDeduplicatedBiases, 

1559 ) 

1560 # Materialize the dataset queries, too. 

1561 with subsetDataIds.findDatasets( 

1562 bias, collections=["imported_r", "imported_g"], findFirst=False 

1563 ).materialize() as biases: 

1564 self.assertCountEqual(list(biases), expectedAllBiases) 

1565 with subsetDataIds.findDatasets( 

1566 bias, collections=["imported_r", "imported_g"], findFirst=True 

1567 ).materialize() as biases: 

1568 self.assertCountEqual(list(biases), expectedDeduplicatedBiases) 

1569 # Materialize the original query, but none of the follow-up queries. 

1570 with dataIds.materialize() as dataIds: 

1571 self.assertEqual(dataIds.graph, expectedGraph) 

1572 self.assertEqual(dataIds.toSet(), expectedDataIds) 

1573 self.assertCountEqual( 

1574 list( 

1575 dataIds.findDatasets( 

1576 flat, 

1577 collections=["imported_r"], 

1578 ) 

1579 ), 

1580 expectedFlats, 

1581 ) 

1582 subsetDataIds = dataIds.subset(expectedSubsetGraph, unique=True) 

1583 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph) 

1584 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds) 

1585 self.assertCountEqual( 

1586 list( 

1587 subsetDataIds.findDatasets( 

1588 bias, collections=["imported_r", "imported_g"], findFirst=False 

1589 ) 

1590 ), 

1591 expectedAllBiases, 

1592 ) 

1593 self.assertCountEqual( 

1594 list( 

1595 subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True) 

1596 ), 

1597 expectedDeduplicatedBiases, 

1598 ) 

1599 # Materialize just the bias dataset queries. 

1600 with subsetDataIds.findDatasets( 

1601 bias, collections=["imported_r", "imported_g"], findFirst=False 

1602 ).materialize() as biases: 

1603 self.assertCountEqual(list(biases), expectedAllBiases) 

1604 with subsetDataIds.findDatasets( 

1605 bias, collections=["imported_r", "imported_g"], findFirst=True 

1606 ).materialize() as biases: 

1607 self.assertCountEqual(list(biases), expectedDeduplicatedBiases) 

1608 # Materialize the subset data ID query, but not the dataset 

1609 # queries. 

1610 with subsetDataIds.materialize() as subsetDataIds: 

1611 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph) 

1612 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds) 

1613 self.assertCountEqual( 

1614 list( 

1615 subsetDataIds.findDatasets( 

1616 bias, collections=["imported_r", "imported_g"], findFirst=False 

1617 ) 

1618 ), 

1619 expectedAllBiases, 

1620 ) 

1621 self.assertCountEqual( 

1622 list( 

1623 subsetDataIds.findDatasets( 

1624 bias, collections=["imported_r", "imported_g"], findFirst=True 

1625 ) 

1626 ), 

1627 expectedDeduplicatedBiases, 

1628 ) 

1629 # Materialize the bias dataset queries, too, so now we're 

1630 # materializing every single step. 

1631 with subsetDataIds.findDatasets( 

1632 bias, collections=["imported_r", "imported_g"], findFirst=False 

1633 ).materialize() as biases: 

1634 self.assertCountEqual(list(biases), expectedAllBiases) 

1635 with subsetDataIds.findDatasets( 

1636 bias, collections=["imported_r", "imported_g"], findFirst=True 

1637 ).materialize() as biases: 

1638 self.assertCountEqual(list(biases), expectedDeduplicatedBiases) 

1639 
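    # Usage sketch for materialize(): it executes the wrapped query once
    # into a temporary table, so follow-up queries reuse the stored rows
    # instead of re-running the original SQL; the table lives only inside
    # the `with` block. Assumes `registry` and a `bias` dataset type as in
    # testQueryResults.
    def _sketch_materialize(self, registry, bias) -> list:
        dataIds = registry.queryDataIds(["detector"], instrument="Cam1")
        with dataIds.materialize() as frozen:
            # `frozen` reads from the temporary table from here on.
            return list(frozen.findDatasets(bias, collections=["imported_g", "imported_r"], findFirst=False))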

1640 def testStorageClassPropagation(self): 

1641 """Test that queries for datasets respect the storage class passed in 

1642 as part of a full dataset type. 

1643 """ 

1644 registry = self.makeRegistry() 

1645 self.loadData(registry, "base.yaml") 

1646 dataset_type_in_registry = DatasetType( 

1647 "tbl", dimensions=["instrument"], storageClass="DataFrame", universe=registry.dimensions 

1648 ) 

1649 registry.registerDatasetType(dataset_type_in_registry) 

1650 run = "run1" 

1651 registry.registerRun(run) 

1652 (inserted_ref,) = registry.insertDatasets( 

1653 dataset_type_in_registry, [registry.expandDataId(instrument="Cam1")], run=run 

1654 ) 

1655 self.assertEqual(inserted_ref.datasetType, dataset_type_in_registry) 

1656 query_dataset_type = DatasetType( 

1657 "tbl", dimensions=["instrument"], storageClass="ArrowAstropy", universe=registry.dimensions 

1658 ) 

1659 self.assertNotEqual(dataset_type_in_registry, query_dataset_type) 

1660 query_datasets_result = registry.queryDatasets(query_dataset_type, collections=[run]) 

1661 self.assertEqual(query_datasets_result.parentDatasetType, query_dataset_type) # type: ignore 

1662 (query_datasets_ref,) = query_datasets_result 

1663 self.assertEqual(query_datasets_ref.datasetType, query_dataset_type) 

1664 query_data_ids_find_datasets_result = registry.queryDataIds(["instrument"]).findDatasets( 

1665 query_dataset_type, collections=[run] 

1666 ) 

1667 self.assertEqual(query_data_ids_find_datasets_result.parentDatasetType, query_dataset_type) 

1668 (query_data_ids_find_datasets_ref,) = query_data_ids_find_datasets_result 

1669 self.assertEqual(query_data_ids_find_datasets_ref.datasetType, query_dataset_type) 

1670 query_dataset_types_result = registry.queryDatasetTypes(query_dataset_type) 

1671 self.assertEqual(list(query_dataset_types_result), [query_dataset_type]) 

1672 find_dataset_ref = registry.findDataset(query_dataset_type, instrument="Cam1", collections=[run]) 

1673 self.assertEqual(find_dataset_ref.datasetType, query_dataset_type) 

1674 
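    # Hedged aside: the storage-class override is per query, not a mutation
    # of the registry; querying again with the registered definition still
    # yields refs carrying the original storage class. Assumes the names
    # defined in testStorageClassPropagation.
    def _sketch_storage_class_roundtrip(self, registry, dataset_type_in_registry, run) -> None:
        (ref,) = registry.queryDatasets(dataset_type_in_registry, collections=[run])
        self.assertEqual(ref.datasetType, dataset_type_in_registry)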

1675 def testEmptyDimensionsQueries(self): 

1676 """Test Query and QueryResults objects in the case where there are no 

1677 dimensions. 

1678 """ 

1679 # Set up test data: one dataset type, two runs, one dataset in each. 

1680 registry = self.makeRegistry() 

1681 self.loadData(registry, "base.yaml") 

1682 schema = DatasetType("schema", dimensions=registry.dimensions.empty, storageClass="Catalog") 

1683 registry.registerDatasetType(schema) 

1684 dataId = DataCoordinate.makeEmpty(registry.dimensions) 

1685 run1 = "run1" 

1686 run2 = "run2" 

1687 registry.registerRun(run1) 

1688 registry.registerRun(run2) 

1689 (dataset1,) = registry.insertDatasets(schema, dataIds=[dataId], run=run1) 

1690 (dataset2,) = registry.insertDatasets(schema, dataIds=[dataId], run=run2) 

1691 # Query directly for both of the datasets at once, and then for each one at a time via findFirst. 

1692 self.checkQueryResults( 

1693 registry.queryDatasets(schema, collections=[run1, run2], findFirst=False), [dataset1, dataset2] 

1694 ) 

1695 self.checkQueryResults( 

1696 registry.queryDatasets(schema, collections=[run1, run2], findFirst=True), 

1697 [dataset1], 

1698 ) 

1699 self.checkQueryResults( 

1700 registry.queryDatasets(schema, collections=[run2, run1], findFirst=True), 

1701 [dataset2], 

1702 ) 

1703 # Query for data IDs with no dimensions. 

1704 dataIds = registry.queryDataIds([]) 

1705 self.checkQueryResults(dataIds, [dataId]) 

1706 # Use queried data IDs to find the datasets. 

1707 self.checkQueryResults( 

1708 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False), 

1709 [dataset1, dataset2], 

1710 ) 

1711 self.checkQueryResults( 

1712 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True), 

1713 [dataset1], 

1714 ) 

1715 self.checkQueryResults( 

1716 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True), 

1717 [dataset2], 

1718 ) 

1719 # Now materialize the data ID query results and repeat those tests. 

1720 with dataIds.materialize() as dataIds: 

1721 self.checkQueryResults(dataIds, [dataId]) 

1722 self.checkQueryResults( 

1723 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True), 

1724 [dataset1], 

1725 ) 

1726 self.checkQueryResults( 

1727 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True), 

1728 [dataset2], 

1729 ) 

1730 # Query for non-empty data IDs, then subset that to get the empty one. 

1731 # Repeat the above tests starting from that. 

1732 dataIds = registry.queryDataIds(["instrument"]).subset(registry.dimensions.empty, unique=True) 

1733 self.checkQueryResults(dataIds, [dataId]) 

1734 self.checkQueryResults( 

1735 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False), 

1736 [dataset1, dataset2], 

1737 ) 

1738 self.checkQueryResults( 

1739 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True), 

1740 [dataset1], 

1741 ) 

1742 self.checkQueryResults( 

1743 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True), 

1744 [dataset2], 

1745 ) 

1746 with dataIds.materialize() as dataIds: 

1747 self.checkQueryResults(dataIds, [dataId]) 

1748 self.checkQueryResults( 

1749 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False), 

1750 [dataset1, dataset2], 

1751 ) 

1752 self.checkQueryResults( 

1753 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True), 

1754 [dataset1], 

1755 ) 

1756 self.checkQueryResults( 

1757 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True), 

1758 [dataset2], 

1759 ) 

1760 # Query for non-empty data IDs, then materialize, then subset to get 

1761 # the empty one. Repeat again. 

1762 with registry.queryDataIds(["instrument"]).materialize() as nonEmptyDataIds: 

1763 dataIds = nonEmptyDataIds.subset(registry.dimensions.empty, unique=True) 

1764 self.checkQueryResults(dataIds, [dataId]) 

1765 self.checkQueryResults( 

1766 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False), 

1767 [dataset1, dataset2], 

1768 ) 

1769 self.checkQueryResults( 

1770 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True), 

1771 [dataset1], 

1772 ) 

1773 self.checkQueryResults( 

1774 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True), 

1775 [dataset2], 

1776 ) 

1777 with dataIds.materialize() as dataIds: 

1778 self.checkQueryResults(dataIds, [dataId]) 

1779 self.checkQueryResults( 

1780 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False), 

1781 [dataset1, dataset2], 

1782 ) 

1783 self.checkQueryResults( 

1784 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True), 

1785 [dataset1], 

1786 ) 

1787 self.checkQueryResults( 

1788 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True), 

1789 [dataset2], 

1790 ) 

1791 # Query for non-empty data IDs with a constraint on an empty-data-ID 

1792 # dataset that exists. 

1793 dataIds = registry.queryDataIds(["instrument"], datasets="schema", collections=...) 

1794 self.checkQueryResults( 

1795 dataIds.subset(unique=True), 

1796 [DataCoordinate.standardize(instrument="Cam1", universe=registry.dimensions)], 

1797 ) 

1798 # Again query for non-empty data IDs with a constraint on empty-data-ID 

1799 # datasets, but when the datasets don't exist. We delete the existing 

1800 # dataset and query just that collection rather than creating a new 

1801 # empty collection because this is a bit less likely for our build-time 

1802 # logic to shortcut-out (via the collection summaries), and such a 

1803 # shortcut would make this test a bit more trivial than we'd like. 

1804 registry.removeDatasets([dataset2]) 

1805 dataIds = registry.queryDataIds(["instrument"], datasets="schema", collections=run2) 

1806 self.checkQueryResults(dataIds, []) 

1807 

1808 def testDimensionDataModifications(self): 

1809 """Test that modifying dimension records via: 

1810 syncDimensionData(..., update=True) and 

1811 insertDimensionData(..., replace=True) works as expected, even in the 

1812 presence of datasets using those dimensions and spatial overlap 

1813 relationships. 

1814 """ 

1815 

1816 def unpack_range_set(ranges: lsst.sphgeom.RangeSet) -> Iterator[int]: 

1817 """Unpack a sphgeom.RangeSet into the integers it contains.""" 

1818 for begin, end in ranges: 

1819 yield from range(begin, end) 

1820 

1821 def range_set_hull( 

1822 ranges: lsst.sphgeom.RangeSet, 

1823 pixelization: lsst.sphgeom.HtmPixelization, 

1824 ) -> lsst.sphgeom.ConvexPolygon: 

1825 """Create a ConvexPolygon hull of the region defined by a set of 

1826 HTM pixelization index ranges. 

1827 """ 

1828 points = [] 

1829 for index in unpack_range_set(ranges): 

1830 points.extend(pixelization.triangle(index).getVertices()) 

1831 return lsst.sphgeom.ConvexPolygon(points) 

1832 
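        # Quick sanity sketch for the helpers above (an aside, not essential
        # to the test): the hull of a single-trixel RangeSet is that trixel
        # itself, so the two regions must contain each other. 49152 is a
        # valid level-6 HTM index (level-6 indexes span [8*4**6, 16*4**6)).
        _htm6_pix = lsst.sphgeom.HtmPixelization(6)
        _tri = _htm6_pix.triangle(49152)
        _hull = range_set_hull(lsst.sphgeom.RangeSet(49152), _htm6_pix)
        self.assertTrue(_hull.contains(_tri))
        self.assertTrue(_tri.contains(_hull))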

1833 # Use HTM to set up an initial parent region (one arbitrary trixel) 

1834 # and four child regions (the trixels within the parent at the next 

1835 # level). We'll use the parent as a tract/visit region and the children 

1836 # as its patch/visit_detector regions. 

1837 registry = self.makeRegistry() 

1838 htm6 = registry.dimensions.skypix["htm"][6].pixelization 

1839 commonSkyPix = registry.dimensions.commonSkyPix.pixelization 

1840 index = 12288 

1841 child_ranges_small = lsst.sphgeom.RangeSet(index).scaled(4) 

1842 assert htm6.universe().contains(child_ranges_small) 

1843 child_regions_small = [htm6.triangle(i) for i in unpack_range_set(child_ranges_small)] 

1844 parent_region_small = lsst.sphgeom.ConvexPolygon( 

1845 list(itertools.chain.from_iterable(c.getVertices() for c in child_regions_small)) 

1846 ) 

1847 assert all(parent_region_small.contains(c) for c in child_regions_small) 

1848 # Make a larger version of each child region, defined to be the set of 

1849 # htm6 trixels that overlap the original's bounding circle. Make a new 

1850 # parent that's the convex hull of the new children. 

1851 child_regions_large = [ 

1852 range_set_hull(htm6.envelope(c.getBoundingCircle()), htm6) for c in child_regions_small 

1853 ] 

1854 assert all(large.contains(small) for large, small in zip(child_regions_large, child_regions_small)) 

1855 parent_region_large = lsst.sphgeom.ConvexPolygon( 

1856 list(itertools.chain.from_iterable(c.getVertices() for c in child_regions_large)) 

1857 ) 

1858 assert all(parent_region_large.contains(c) for c in child_regions_large) 

1859 assert parent_region_large.contains(parent_region_small) 

1860 assert not parent_region_small.contains(parent_region_large) 

1861 assert not all(parent_region_small.contains(c) for c in child_regions_large) 

1862 # Find some commonSkyPix indices that overlap the large regions but do 

1863 # not overlap the small regions. We use commonSkyPix here to make sure the 

1864 # real tests later involve what's in the database, not just post-query 

1865 # filtering of regions. 

1866 child_difference_indices = [] 

1867 for large, small in zip(child_regions_large, child_regions_small): 

1868 difference = list(unpack_range_set(commonSkyPix.envelope(large) - commonSkyPix.envelope(small))) 

1869 assert difference, "if this is empty, we can't test anything useful with these regions" 

1870 assert all( 

1871 not commonSkyPix.triangle(d).isDisjointFrom(large) 

1872 and commonSkyPix.triangle(d).isDisjointFrom(small) 

1873 for d in difference 

1874 ) 

1875 child_difference_indices.append(difference) 

1876 parent_difference_indices = list( 

1877 unpack_range_set( 

1878 commonSkyPix.envelope(parent_region_large) - commonSkyPix.envelope(parent_region_small) 

1879 ) 

1880 ) 

1881 assert parent_difference_indices, "if this is empty, we can't test anything useful with these regions" 

1882 assert all( 

1883 ( 

1884 not commonSkyPix.triangle(d).isDisjointFrom(parent_region_large) 

1885 and commonSkyPix.triangle(d).isDisjointFrom(parent_region_small) 

1886 ) 

1887 for d in parent_difference_indices 

1888 ) 

1889 # Now that we've finally got those regions, we'll insert the large ones 

1890 # as tract/patch dimension records. 

1891 skymap_name = "testing_v1" 

1892 registry.insertDimensionData( 

1893 "skymap", 

1894 { 

1895 "name": skymap_name, 

1896 "hash": bytes([42]), 

1897 "tract_max": 1, 

1898 "patch_nx_max": 2, 

1899 "patch_ny_max": 2, 

1900 }, 

1901 ) 

1902 registry.insertDimensionData("tract", {"skymap": skymap_name, "id": 0, "region": parent_region_large}) 

1903 registry.insertDimensionData( 

1904 "patch", 

1905 *[ 

1906 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c} 

1907 for n, c in enumerate(child_regions_large) 

1908 ], 

1909 ) 

1910 # Add a dataset that uses these dimensions to make sure that modifying 

1911 # them doesn't disrupt foreign keys (need to make sure DB doesn't 

1912 # implement insert with replace=True as delete-then-insert). 

1913 dataset_type = DatasetType( 

1914 "coadd", 

1915 dimensions=["tract", "patch"], 

1916 universe=registry.dimensions, 

1917 storageClass="Exposure", 

1918 ) 

1919 registry.registerDatasetType(dataset_type) 

1920 registry.registerCollection("the_run", CollectionType.RUN) 

1921 registry.insertDatasets( 

1922 dataset_type, 

1923 [{"skymap": skymap_name, "tract": 0, "patch": 2}], 

1924 run="the_run", 

1925 ) 

1926 # Query for tracts and patches that overlap some "difference" 

1927 # commonSkyPix pixels; there should be overlaps, because the database 

1928 # has the "large" suite of regions. 

1929 self.assertEqual( 

1930 {0}, 

1931 { 

1932 data_id["tract"] 

1933 for data_id in registry.queryDataIds( 

1934 ["tract"], 

1935 skymap=skymap_name, 

1936 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]}, 

1937 ) 

1938 }, 

1939 ) 

1940 for patch_id, patch_difference_indices in enumerate(child_difference_indices): 

1941 self.assertIn( 

1942 patch_id, 

1943 { 

1944 data_id["patch"] 

1945 for data_id in registry.queryDataIds( 

1946 ["patch"], 

1947 skymap=skymap_name, 

1948 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]}, 

1949 ) 

1950 }, 

1951 ) 

1952 # Use sync to update the tract region and insert to update the regions 

1953 # of the patches, to the "small" suite. 

1954 updated = registry.syncDimensionData( 

1955 "tract", 

1956 {"skymap": skymap_name, "id": 0, "region": parent_region_small}, 

1957 update=True, 

1958 ) 

1959 self.assertEqual(updated, {"region": parent_region_large}) 

1960 registry.insertDimensionData( 

1961 "patch", 

1962 *[ 

1963 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c} 

1964 for n, c in enumerate(child_regions_small) 

1965 ], 

1966 replace=True, 

1967 ) 

1968 # Query again; there now should be no such overlaps, because the 

1969 # database has the "small" suite of regions. 

1970 self.assertFalse( 

1971 set( 

1972 registry.queryDataIds( 

1973 ["tract"], 

1974 skymap=skymap_name, 

1975 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]}, 

1976 ) 

1977 ) 

1978 ) 

1979 for patch_id, patch_difference_indices in enumerate(child_difference_indices): 

1980 self.assertNotIn( 

1981 patch_id, 

1982 { 

1983 data_id["patch"] 

1984 for data_id in registry.queryDataIds( 

1985 ["patch"], 

1986 skymap=skymap_name, 

1987 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]}, 

1988 ) 

1989 }, 

1990 ) 

1991 # Update back to the large regions and query one more time. 

1992 updated = registry.syncDimensionData( 

1993 "tract", 

1994 {"skymap": skymap_name, "id": 0, "region": parent_region_large}, 

1995 update=True, 

1996 ) 

1997 self.assertEqual(updated, {"region": parent_region_small}) 

1998 registry.insertDimensionData( 

1999 "patch", 

2000 *[ 

2001 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c} 

2002 for n, c in enumerate(child_regions_large) 

2003 ], 

2004 replace=True, 

2005 ) 

2006 self.assertEqual( 

2007 {0}, 

2008 { 

2009 data_id["tract"] 

2010 for data_id in registry.queryDataIds( 

2011 ["tract"], 

2012 skymap=skymap_name, 

2013 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]}, 

2014 ) 

2015 }, 

2016 ) 

2017 for patch_id, patch_difference_indices in enumerate(child_difference_indices): 

2018 self.assertIn( 

2019 patch_id, 

2020 { 

2021 data_id["patch"] 

2022 for data_id in registry.queryDataIds( 

2023 ["patch"], 

2024 skymap=skymap_name, 

2025 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]}, 

2026 ) 

2027 }, 

2028 ) 

2029 

2030 def testCalibrationCollections(self): 

2031 """Test operations on `~CollectionType.CALIBRATION` collections, 

2032 including `Registry.certify`, `Registry.decertify`, and 

2033 `Registry.findDataset`. 

2034 """ 

2035 # Setup - make a Registry, fill it with some datasets in 

2036 # non-calibration collections. 

2037 registry = self.makeRegistry() 

2038 self.loadData(registry, "base.yaml") 

2039 self.loadData(registry, "datasets.yaml") 

2040 # Set up some timestamps. 

2041 t1 = astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai") 

2042 t2 = astropy.time.Time("2020-01-01T02:00:00", format="isot", scale="tai") 

2043 t3 = astropy.time.Time("2020-01-01T03:00:00", format="isot", scale="tai") 

2044 t4 = astropy.time.Time("2020-01-01T04:00:00", format="isot", scale="tai") 

2045 t5 = astropy.time.Time("2020-01-01T05:00:00", format="isot", scale="tai") 

2046 allTimespans = [ 

2047 Timespan(a, b) for a, b in itertools.combinations([None, t1, t2, t3, t4, t5, None], r=2) 

2048 ] 

2049 # Get references to some datasets. 

2050 bias2a = registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g") 

2051 bias3a = registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g") 

2052 bias2b = registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r") 

2053 bias3b = registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r") 

2054 # Register the main calibration collection we'll be working with. 

2055 collection = "Cam1/calibs/default" 

2056 registry.registerCollection(collection, type=CollectionType.CALIBRATION) 

2057 # Cannot associate into a calibration collection (no timespan). 

2058 with self.assertRaises(CollectionTypeError): 

2059 registry.associate(collection, [bias2a]) 

2060 # Certify 2a dataset with [t2, t4) validity. 

2061 registry.certify(collection, [bias2a], Timespan(begin=t2, end=t4)) 

2062 # Test that we can query for this dataset via the new collection, both 

2063 # on its own and with a RUN collection, as long as we don't try to join 

2064 # in temporal dimensions or use findFirst=True. 

2065 self.assertEqual( 

2066 set(registry.queryDatasets("bias", findFirst=False, collections=collection)), 

2067 {bias2a}, 

2068 ) 

2069 self.assertEqual( 

2070 set(registry.queryDatasets("bias", findFirst=False, collections=[collection, "imported_r"])), 

2071 { 

2072 bias2a, 

2073 bias2b, 

2074 bias3b, 

2075 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"), 

2076 }, 

2077 ) 

2078 self.assertEqual( 

2079 set(registry.queryDataIds("detector", datasets="bias", collections=collection)), 

2080 {registry.expandDataId(instrument="Cam1", detector=2)}, 

2081 ) 

2082 self.assertEqual( 

2083 set(registry.queryDataIds("detector", datasets="bias", collections=[collection, "imported_r"])), 

2084 { 

2085 registry.expandDataId(instrument="Cam1", detector=2), 

2086 registry.expandDataId(instrument="Cam1", detector=3), 

2087 registry.expandDataId(instrument="Cam1", detector=4), 

2088 }, 

2089 ) 

2090 

2091 # We should not be able to certify 2b with anything overlapping that 

2092 # window. 

2093 with self.assertRaises(ConflictingDefinitionError): 

2094 registry.certify(collection, [bias2b], Timespan(begin=None, end=t3)) 

2095 with self.assertRaises(ConflictingDefinitionError): 

2096 registry.certify(collection, [bias2b], Timespan(begin=None, end=t5)) 

2097 with self.assertRaises(ConflictingDefinitionError): 

2098 registry.certify(collection, [bias2b], Timespan(begin=t1, end=t3)) 

2099 with self.assertRaises(ConflictingDefinitionError): 

2100 registry.certify(collection, [bias2b], Timespan(begin=t1, end=t5)) 

2101 with self.assertRaises(ConflictingDefinitionError): 

2102 registry.certify(collection, [bias2b], Timespan(begin=t1, end=None)) 

2103 with self.assertRaises(ConflictingDefinitionError): 

2104 registry.certify(collection, [bias2b], Timespan(begin=t2, end=t3)) 

2105 with self.assertRaises(ConflictingDefinitionError): 

2106 registry.certify(collection, [bias2b], Timespan(begin=t2, end=t5)) 

2107 with self.assertRaises(ConflictingDefinitionError): 

2108 registry.certify(collection, [bias2b], Timespan(begin=t2, end=None)) 

2109 # We should be able to certify 3a with a range overlapping that window, 

2110 # because it's for a different detector. 

2111 # We'll certify 3a over [t1, t3). 

2112 registry.certify(collection, [bias3a], Timespan(begin=t1, end=t3)) 

2113 # Now we'll certify 2b and 3b together over [t4, ∞). 

2114 registry.certify(collection, [bias2b, bias3b], Timespan(begin=t4, end=None)) 

2115 

2116 # Fetch all associations and check that they are what we expect. 

2117 self.assertCountEqual( 

2118 list( 

2119 registry.queryDatasetAssociations( 

2120 "bias", 

2121 collections=[collection, "imported_g", "imported_r"], 

2122 ) 

2123 ), 

2124 [ 

2125 DatasetAssociation( 

2126 ref=registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"), 

2127 collection="imported_g", 

2128 timespan=None, 

2129 ), 

2130 DatasetAssociation( 

2131 ref=registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"), 

2132 collection="imported_r", 

2133 timespan=None, 

2134 ), 

2135 DatasetAssociation(ref=bias2a, collection="imported_g", timespan=None), 

2136 DatasetAssociation(ref=bias3a, collection="imported_g", timespan=None), 

2137 DatasetAssociation(ref=bias2b, collection="imported_r", timespan=None), 

2138 DatasetAssociation(ref=bias3b, collection="imported_r", timespan=None), 

2139 DatasetAssociation(ref=bias2a, collection=collection, timespan=Timespan(begin=t2, end=t4)), 

2140 DatasetAssociation(ref=bias3a, collection=collection, timespan=Timespan(begin=t1, end=t3)), 

2141 DatasetAssociation(ref=bias2b, collection=collection, timespan=Timespan(begin=t4, end=None)), 

2142 DatasetAssociation(ref=bias3b, collection=collection, timespan=Timespan(begin=t4, end=None)), 

2143 ], 

2144 ) 

2145 

2146 class Ambiguous: 

2147 """Tag class to denote lookups that should be ambiguous.""" 

2148 

2149 pass 

2150 

2151 def assertLookup( 

2152 detector: int, timespan: Timespan, expected: Optional[Union[DatasetRef, Type[Ambiguous]]] 

2153 ) -> None: 

2154 """Local function that asserts that a bias lookup returns the given 

2155 expected result. 

2156 """ 

2157 if expected is Ambiguous: 

2158 with self.assertRaises((DatasetTypeError, LookupError)): 

2159 registry.findDataset( 

2160 "bias", 

2161 collections=collection, 

2162 instrument="Cam1", 

2163 detector=detector, 

2164 timespan=timespan, 

2165 ) 

2166 else: 

2167 self.assertEqual( 

2168 expected, 

2169 registry.findDataset( 

2170 "bias", 

2171 collections=collection, 

2172 instrument="Cam1", 

2173 detector=detector, 

2174 timespan=timespan, 

2175 ), 

2176 ) 

2177 

2178 # Systematically test lookups against expected results. 

2179 assertLookup(detector=2, timespan=Timespan(None, t1), expected=None) 

2180 assertLookup(detector=2, timespan=Timespan(None, t2), expected=None) 

2181 assertLookup(detector=2, timespan=Timespan(None, t3), expected=bias2a) 

2182 assertLookup(detector=2, timespan=Timespan(None, t4), expected=bias2a) 

2183 assertLookup(detector=2, timespan=Timespan(None, t5), expected=Ambiguous) 

2184 assertLookup(detector=2, timespan=Timespan(None, None), expected=Ambiguous) 

2185 assertLookup(detector=2, timespan=Timespan(t1, t2), expected=None) 

2186 assertLookup(detector=2, timespan=Timespan(t1, t3), expected=bias2a) 

2187 assertLookup(detector=2, timespan=Timespan(t1, t4), expected=bias2a) 

2188 assertLookup(detector=2, timespan=Timespan(t1, t5), expected=Ambiguous) 

2189 assertLookup(detector=2, timespan=Timespan(t1, None), expected=Ambiguous) 

2190 assertLookup(detector=2, timespan=Timespan(t2, t3), expected=bias2a) 

2191 assertLookup(detector=2, timespan=Timespan(t2, t4), expected=bias2a) 

2192 assertLookup(detector=2, timespan=Timespan(t2, t5), expected=Ambiguous) 

2193 assertLookup(detector=2, timespan=Timespan(t2, None), expected=Ambiguous) 

2194 assertLookup(detector=2, timespan=Timespan(t3, t4), expected=bias2a) 

2195 assertLookup(detector=2, timespan=Timespan(t3, t5), expected=Ambiguous) 

2196 assertLookup(detector=2, timespan=Timespan(t3, None), expected=Ambiguous) 

2197 assertLookup(detector=2, timespan=Timespan(t4, t5), expected=bias2b) 

2198 assertLookup(detector=2, timespan=Timespan(t4, None), expected=bias2b) 

2199 assertLookup(detector=2, timespan=Timespan(t5, None), expected=bias2b) 

2200 assertLookup(detector=3, timespan=Timespan(None, t1), expected=None) 

2201 assertLookup(detector=3, timespan=Timespan(None, t2), expected=bias3a) 

2202 assertLookup(detector=3, timespan=Timespan(None, t3), expected=bias3a) 

2203 assertLookup(detector=3, timespan=Timespan(None, t4), expected=bias3a) 

2204 assertLookup(detector=3, timespan=Timespan(None, t5), expected=Ambiguous) 

2205 assertLookup(detector=3, timespan=Timespan(None, None), expected=Ambiguous) 

2206 assertLookup(detector=3, timespan=Timespan(t1, t2), expected=bias3a) 

2207 assertLookup(detector=3, timespan=Timespan(t1, t3), expected=bias3a) 

2208 assertLookup(detector=3, timespan=Timespan(t1, t4), expected=bias3a) 

2209 assertLookup(detector=3, timespan=Timespan(t1, t5), expected=Ambiguous) 

2210 assertLookup(detector=3, timespan=Timespan(t1, None), expected=Ambiguous) 

2211 assertLookup(detector=3, timespan=Timespan(t2, t3), expected=bias3a) 

2212 assertLookup(detector=3, timespan=Timespan(t2, t4), expected=bias3a) 

2213 assertLookup(detector=3, timespan=Timespan(t2, t5), expected=Ambiguous) 

2214 assertLookup(detector=3, timespan=Timespan(t2, None), expected=Ambiguous) 

2215 assertLookup(detector=3, timespan=Timespan(t3, t4), expected=None) 

2216 assertLookup(detector=3, timespan=Timespan(t3, t5), expected=bias3b) 

2217 assertLookup(detector=3, timespan=Timespan(t3, None), expected=bias3b) 

2218 assertLookup(detector=3, timespan=Timespan(t4, t5), expected=bias3b) 

2219 assertLookup(detector=3, timespan=Timespan(t4, None), expected=bias3b) 

2220 assertLookup(detector=3, timespan=Timespan(t5, None), expected=bias3b) 

2221 

2222 # Decertify [t3, t5) for all data IDs, and do test lookups again. 

2223 # This should truncate bias2a to [t2, t3), leave bias3a unchanged at 

2224 # [t1, t3), and truncate bias2b and bias3b to [t5, ∞). 

2225 registry.decertify(collection=collection, datasetType="bias", timespan=Timespan(t3, t5)) 

2226 assertLookup(detector=2, timespan=Timespan(None, t1), expected=None) 

2227 assertLookup(detector=2, timespan=Timespan(None, t2), expected=None) 

2228 assertLookup(detector=2, timespan=Timespan(None, t3), expected=bias2a) 

2229 assertLookup(detector=2, timespan=Timespan(None, t4), expected=bias2a) 

2230 assertLookup(detector=2, timespan=Timespan(None, t5), expected=bias2a) 

2231 assertLookup(detector=2, timespan=Timespan(None, None), expected=Ambiguous) 

2232 assertLookup(detector=2, timespan=Timespan(t1, t2), expected=None) 

2233 assertLookup(detector=2, timespan=Timespan(t1, t3), expected=bias2a) 

2234 assertLookup(detector=2, timespan=Timespan(t1, t4), expected=bias2a) 

2235 assertLookup(detector=2, timespan=Timespan(t1, t5), expected=bias2a) 

2236 assertLookup(detector=2, timespan=Timespan(t1, None), expected=Ambiguous) 

2237 assertLookup(detector=2, timespan=Timespan(t2, t3), expected=bias2a) 

2238 assertLookup(detector=2, timespan=Timespan(t2, t4), expected=bias2a) 

2239 assertLookup(detector=2, timespan=Timespan(t2, t5), expected=bias2a) 

2240 assertLookup(detector=2, timespan=Timespan(t2, None), expected=Ambiguous) 

2241 assertLookup(detector=2, timespan=Timespan(t3, t4), expected=None) 

2242 assertLookup(detector=2, timespan=Timespan(t3, t5), expected=None) 

2243 assertLookup(detector=2, timespan=Timespan(t3, None), expected=bias2b) 

2244 assertLookup(detector=2, timespan=Timespan(t4, t5), expected=None) 

2245 assertLookup(detector=2, timespan=Timespan(t4, None), expected=bias2b) 

2246 assertLookup(detector=2, timespan=Timespan(t5, None), expected=bias2b) 

2247 assertLookup(detector=3, timespan=Timespan(None, t1), expected=None) 

2248 assertLookup(detector=3, timespan=Timespan(None, t2), expected=bias3a) 

2249 assertLookup(detector=3, timespan=Timespan(None, t3), expected=bias3a) 

2250 assertLookup(detector=3, timespan=Timespan(None, t4), expected=bias3a) 

2251 assertLookup(detector=3, timespan=Timespan(None, t5), expected=bias3a) 

2252 assertLookup(detector=3, timespan=Timespan(None, None), expected=Ambiguous) 

2253 assertLookup(detector=3, timespan=Timespan(t1, t2), expected=bias3a) 

2254 assertLookup(detector=3, timespan=Timespan(t1, t3), expected=bias3a) 

2255 assertLookup(detector=3, timespan=Timespan(t1, t4), expected=bias3a) 

2256 assertLookup(detector=3, timespan=Timespan(t1, t5), expected=bias3a) 

2257 assertLookup(detector=3, timespan=Timespan(t1, None), expected=Ambiguous) 

2258 assertLookup(detector=3, timespan=Timespan(t2, t3), expected=bias3a) 

2259 assertLookup(detector=3, timespan=Timespan(t2, t4), expected=bias3a) 

2260 assertLookup(detector=3, timespan=Timespan(t2, t5), expected=bias3a) 

2261 assertLookup(detector=3, timespan=Timespan(t2, None), expected=Ambiguous) 

2262 assertLookup(detector=3, timespan=Timespan(t3, t4), expected=None) 

2263 assertLookup(detector=3, timespan=Timespan(t3, t5), expected=None) 

2264 assertLookup(detector=3, timespan=Timespan(t3, None), expected=bias3b) 

2265 assertLookup(detector=3, timespan=Timespan(t4, t5), expected=None) 

2266 assertLookup(detector=3, timespan=Timespan(t4, None), expected=bias3b) 

2267 assertLookup(detector=3, timespan=Timespan(t5, None), expected=bias3b) 

2268 

2269 # Decertify everything, this time with explicit data IDs, then check 

2270 # that no lookups succeed. 

2271 registry.decertify( 

2272 collection, 

2273 "bias", 

2274 Timespan(None, None), 

2275 dataIds=[ 

2276 dict(instrument="Cam1", detector=2), 

2277 dict(instrument="Cam1", detector=3), 

2278 ], 

2279 ) 

2280 for detector in (2, 3): 

2281 for timespan in allTimespans: 

2282 assertLookup(detector=detector, timespan=timespan, expected=None) 

2283 # Certify bias2a and bias3a over (-∞, ∞), check that all lookups return 

2284 # those. 

2285 registry.certify( 

2286 collection, 

2287 [bias2a, bias3a], 

2288 Timespan(None, None), 

2289 ) 

2290 for timespan in allTimespans: 

2291 assertLookup(detector=2, timespan=timespan, expected=bias2a) 

2292 assertLookup(detector=3, timespan=timespan, expected=bias3a) 

2293 # Decertify just bias2 over [t2, t4). 

2294 # This should split a single certification row into two (and leave the 

2295 # other existing row, for bias3a, alone). 

2296 registry.decertify( 

2297 collection, "bias", Timespan(t2, t4), dataIds=[dict(instrument="Cam1", detector=2)] 

2298 ) 

2299 for timespan in allTimespans: 

2300 assertLookup(detector=3, timespan=timespan, expected=bias3a) 

2301 overlapsBefore = timespan.overlaps(Timespan(None, t2)) 

2302 overlapsAfter = timespan.overlaps(Timespan(t4, None)) 

2303 if overlapsBefore and overlapsAfter: 

2304 expected = Ambiguous 

2305 elif overlapsBefore or overlapsAfter: 

2306 expected = bias2a 

2307 else: 

2308 expected = None 

2309 assertLookup(detector=2, timespan=timespan, expected=expected) 

2310 
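    # Hedged aside distilling the Timespan semantics exercised above:
    # timespans behave as half-open intervals [begin, end), so spans that
    # merely touch at an endpoint do not overlap, while an unbounded span
    # overlaps everything. Standalone illustration, not one of the tests.
    def _sketch_timespan_semantics(self) -> None:
        t_a = astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai")
        t_b = astropy.time.Time("2020-01-01T02:00:00", format="isot", scale="tai")
        t_c = astropy.time.Time("2020-01-01T03:00:00", format="isot", scale="tai")
        self.assertTrue(Timespan(t_a, t_c).overlaps(Timespan(t_b, t_c)))
        self.assertFalse(Timespan(t_a, t_b).overlaps(Timespan(t_b, t_c)))  # ends are exclusive
        self.assertTrue(Timespan(None, None).overlaps(Timespan(t_a, t_b)))  # unbounded overlaps all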

2311 def testSkipCalibs(self): 

2312 """Test how queries handle skipping of calibration collections.""" 

2313 registry = self.makeRegistry() 

2314 self.loadData(registry, "base.yaml") 

2315 self.loadData(registry, "datasets.yaml") 

2316 

2317 coll_calib = "Cam1/calibs/default" 

2318 registry.registerCollection(coll_calib, type=CollectionType.CALIBRATION) 

2319 

2320 # Add all biases to the calibration collection. 

2321 # Without this, the logic that prunes dataset subqueries based on 

2322 # datasetType-collection summary information will fire before the logic 

2323 # we want to test below. This is a good thing (it avoids the dreaded 

2324 # NotImplementedError a bit more often) everywhere but here. 

2325 registry.certify(coll_calib, registry.queryDatasets("bias", collections=...), Timespan(None, None)) 

2326 

2327 coll_list = [coll_calib, "imported_g", "imported_r"] 

2328 chain = "Cam1/chain" 

2329 registry.registerCollection(chain, type=CollectionType.CHAINED) 

2330 registry.setCollectionChain(chain, coll_list) 

2331 

2332 # explicit list will raise if findFirst=True or there are temporal 

2333 # dimensions 

2334 with self.assertRaises(NotImplementedError): 

2335 registry.queryDatasets("bias", collections=coll_list, findFirst=True) 

2336 with self.assertRaises(NotImplementedError): 

2337 registry.queryDataIds( 

2338 ["instrument", "detector", "exposure"], datasets="bias", collections=coll_list 

2339 ).count() 

2340 

2341 # chain will skip 

2342 datasets = list(registry.queryDatasets("bias", collections=chain)) 

2343 self.assertGreater(len(datasets), 0) 

2344 

2345 dataIds = list(registry.queryDataIds(["instrument", "detector"], datasets="bias", collections=chain)) 

2346 self.assertGreater(len(dataIds), 0) 

2347 

2348 # glob will skip too 

2349 datasets = list(registry.queryDatasets("bias", collections="*d*")) 

2350 self.assertGreater(len(datasets), 0) 

2351 

2352 # regular expression will skip too 

2353 pattern = re.compile(".*") 

2354 datasets = list(registry.queryDatasets("bias", collections=pattern)) 

2355 self.assertGreater(len(datasets), 0) 

2356 

2357 # ellipsis should work as usual 

2358 datasets = list(registry.queryDatasets("bias", collections=...)) 

2359 self.assertGreater(len(datasets), 0) 

2360 

2361 # a few tests with findFirst 

2362 datasets = list(registry.queryDatasets("bias", collections=chain, findFirst=True)) 

2363 self.assertGreater(len(datasets), 0) 

2364 

2365 def testIngestTimeQuery(self): 

2366 registry = self.makeRegistry() 

2367 self.loadData(registry, "base.yaml") 

2368 dt0 = datetime.utcnow() 

2369 self.loadData(registry, "datasets.yaml") 

2370 dt1 = datetime.utcnow() 

2371 

2372 datasets = list(registry.queryDatasets(..., collections=...)) 

2373 len0 = len(datasets) 

2374 self.assertGreater(len0, 0) 

2375 

2376 where = "ingest_date > T'2000-01-01'" 

2377 datasets = list(registry.queryDatasets(..., collections=..., where=where)) 

2378 len1 = len(datasets) 

2379 self.assertEqual(len0, len1) 

2380 

2381 # no one will ever use this piece of software in 30 years 

2382 where = "ingest_date > T'2050-01-01'" 

2383 datasets = list(registry.queryDatasets(..., collections=..., where=where)) 

2384 len2 = len(datasets) 

2385 self.assertEqual(len2, 0) 

2386 

2387 # Check more exact timing to make sure there is no 37-second offset 

2388 # (after fixing DM-30124). SQLite time precision is 1 second, so make 

2389 # sure that we don't test with higher precision. 

2390 tests = [ 

2391 # format: (timestamp, operator, expected_len) 

2392 (dt0 - timedelta(seconds=1), ">", len0), 

2393 (dt0 - timedelta(seconds=1), "<", 0), 

2394 (dt1 + timedelta(seconds=1), "<", len0), 

2395 (dt1 + timedelta(seconds=1), ">", 0), 

2396 ] 

2397 for dt, op, expect_len in tests: 

2398 dt_str = dt.isoformat(sep=" ") 

2399 

2400 where = f"ingest_date {op} T'{dt_str}'" 

2401 datasets = list(registry.queryDatasets(..., collections=..., where=where)) 

2402 self.assertEqual(len(datasets), expect_len) 

2403 

2404 # same with bind using datetime or astropy Time 

2405 where = f"ingest_date {op} ingest_time" 

2406 datasets = list( 

2407 registry.queryDatasets(..., collections=..., where=where, bind={"ingest_time": dt}) 

2408 ) 

2409 self.assertEqual(len(datasets), expect_len) 

2410 

2411 dt_astropy = astropy.time.Time(dt, format="datetime") 

2412 datasets = list( 

2413 registry.queryDatasets(..., collections=..., where=where, bind={"ingest_time": dt_astropy}) 

2414 ) 

2415 self.assertEqual(len(datasets), expect_len) 

2416 
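    # Hedged sketch of building the T'...' time literals used above: the
    # expression parser accepts ISO 8601 strings inside T'...', while
    # `bind` is usually the safer route for programmatic values. Assumes a
    # registry whose datasets were all ingested before `cutoff`.
    def _sketch_ingest_date_literal(self, registry) -> None:
        cutoff = datetime(2050, 1, 1)
        where = f"ingest_date < T'{cutoff.isoformat(sep=' ')}'"
        self.assertGreater(len(list(registry.queryDatasets(..., collections=..., where=where))), 0)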

2417 def testTimespanQueries(self): 

2418 """Test query expressions involving timespans.""" 

2419 registry = self.makeRegistry() 

2420 self.loadData(registry, "hsc-rc2-subset.yaml") 

2421 # All visits in the database; mapping from ID to timespan. 

2422 visits = {record.id: record.timespan for record in registry.queryDimensionRecords("visit")} 

2423 # Just those IDs, sorted (which is also temporal sorting, because HSC 

2424 # visit IDs are monotonically increasing). 

2425 ids = sorted(visits.keys()) 

2426 self.assertGreater(len(ids), 20) 

2427 # Pick some quasi-random indexes into `ids` to play with. 

2428 i1 = int(len(ids) * 0.1) 

2429 i2 = int(len(ids) * 0.3) 

2430 i3 = int(len(ids) * 0.6) 

2431 i4 = int(len(ids) * 0.8) 

2432 # Extract some times from those: just before the beginning of i1 (which 

2433 # should be after the end of the previous visit), exactly the 

2434 # beginning of i2, just after the beginning of i3 (and before its end), 

2435 # and the exact end of i4. 

2436 t1 = visits[ids[i1]].begin - astropy.time.TimeDelta(1.0, format="sec") 

2437 self.assertGreater(t1, visits[ids[i1 - 1]].end) 

2438 t2 = visits[ids[i2]].begin 

2439 t3 = visits[ids[i3]].begin + astropy.time.TimeDelta(1.0, format="sec") 

2440 self.assertLess(t3, visits[ids[i3]].end) 

2441 t4 = visits[ids[i4]].end 

2442 # Make sure those are actually in order. 

2443 self.assertEqual([t1, t2, t3, t4], sorted([t4, t3, t2, t1])) 

2444 

2445 bind = { 

2446 "t1": t1, 

2447 "t2": t2, 

2448 "t3": t3, 

2449 "t4": t4, 

2450 "ts23": Timespan(t2, t3), 

2451 } 

2452 

2453 def query(where): 

2454 """Helper function that queries for visit data IDs and returns 

2455 results as a sorted, deduplicated list of visit IDs. 

2456 """ 

2457 return sorted( 

2458 { 

2459 dataId["visit"] 

2460 for dataId in registry.queryDataIds("visit", instrument="HSC", bind=bind, where=where) 

2461 } 

2462 ) 

2463 

2464 # Try a bunch of timespan queries, mixing up the bounds themselves, 

2465 # where they appear in the expression, and how we get the timespan into 

2466 # the expression. 

2467 

2468 # t1 is before the start of i1, so this should not include i1. 

2469 self.assertEqual(ids[:i1], query("visit.timespan OVERLAPS (null, t1)")) 

2470 # t2 is exactly at the start of i2, but ends are exclusive, so these 

2471 # should not include i2. 

2472 self.assertEqual(ids[i1:i2], query("(t1, t2) OVERLAPS visit.timespan")) 

2473 self.assertEqual(ids[:i2], query("visit.timespan < (t2, t4)")) 

2474 # t3 is in the middle of i3, so this should include i3. 

2475 self.assertEqual(ids[i2 : i3 + 1], query("visit.timespan OVERLAPS ts23")) 

2477 # This one should not include i3, by the same reasoning. 

2477 self.assertEqual(ids[i3 + 1 :], query("visit.timespan > (t1, t3)")) 

2478 # t4 is exactly at the end of i4, so this should include i4. 

2479 self.assertEqual(ids[i3 : i4 + 1], query(f"visit.timespan OVERLAPS (T'{t3.tai.isot}', t4)")) 

2481 # i4's upper bound of t4 is exclusive, so this should not include i4. 

2481 self.assertEqual(ids[i4 + 1 :], query("visit.timespan OVERLAPS (t4, NULL)")) 

2482 

2483 # Now some timespan vs. time scalar queries. 

2484 self.assertEqual(ids[:i2], query("visit.timespan < t2")) 

2485 self.assertEqual(ids[:i2], query("t2 > visit.timespan")) 

2486 self.assertEqual(ids[i3 + 1 :], query("visit.timespan > t3")) 

2487 self.assertEqual(ids[i3 + 1 :], query("t3 < visit.timespan")) 

2488 self.assertEqual(ids[i3 : i3 + 1], query("visit.timespan OVERLAPS t3")) 

2489 self.assertEqual(ids[i3 : i3 + 1], query(f"T'{t3.tai.isot}' OVERLAPS visit.timespan")) 

2490 

2491 # Empty timespans should not overlap anything. 

2492 self.assertEqual([], query("visit.timespan OVERLAPS (t3, t2)")) 

2493 
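# The expectations above all follow from Timespan's half-open [begin, end)
# semantics; a small self-contained sketch (the times are arbitrary):
def _sketch_timespan_semantics():
    import astropy.time
    from lsst.daf.butler import Timespan

    t0 = astropy.time.Time("2020-01-01T00:00:00", scale="tai")
    t1 = t0 + astropy.time.TimeDelta(60.0, format="sec")
    t2 = t1 + astropy.time.TimeDelta(60.0, format="sec")
    a = Timespan(t0, t1)
    b = Timespan(t1, t2)
    # Ends are exclusive, so timespans that merely touch do not overlap.
    assert not a.overlaps(b)
    assert Timespan(t0, t2).overlaps(b)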

2494 def testCollectionSummaries(self): 

2495 """Test recording and retrieval of collection summaries.""" 

2496 self.maxDiff = None 

2497 registry = self.makeRegistry() 

2498 # Importing datasets from yaml should go through the code path where 

2499 # we update collection summaries as we insert datasets. 

2500 self.loadData(registry, "base.yaml") 

2501 self.loadData(registry, "datasets.yaml") 

2502 flat = registry.getDatasetType("flat") 

2503 expected1 = CollectionSummary() 

2504 expected1.dataset_types.add(registry.getDatasetType("bias")) 

2505 expected1.add_data_ids( 

2506 flat, [DataCoordinate.standardize(instrument="Cam1", universe=registry.dimensions)] 

2507 ) 

2508 self.assertEqual(registry.getCollectionSummary("imported_g"), expected1) 

2509 self.assertEqual(registry.getCollectionSummary("imported_r"), expected1) 

2510 # Create a chained collection with both of the imported runs; the 

2511 # summary should be the same, because it is a union of identical summaries. 

2512 chain = "chain" 

2513 registry.registerCollection(chain, CollectionType.CHAINED) 

2514 registry.setCollectionChain(chain, ["imported_r", "imported_g"]) 

2515 self.assertEqual(registry.getCollectionSummary(chain), expected1) 

2516 # Associate flats only into a tagged collection and a calibration 

2517 # collection to check summaries of those. 

2518 tag = "tag" 

2519 registry.registerCollection(tag, CollectionType.TAGGED) 

2520 registry.associate(tag, registry.queryDatasets(flat, collections="imported_g")) 

2521 calibs = "calibs" 

2522 registry.registerCollection(calibs, CollectionType.CALIBRATION) 

2523 registry.certify( 

2524 calibs, registry.queryDatasets(flat, collections="imported_g"), timespan=Timespan(None, None) 

2525 ) 

2526 expected2 = expected1.copy() 

2527 expected2.dataset_types.discard("bias") 

2528 self.assertEqual(registry.getCollectionSummary(tag), expected2) 

2529 self.assertEqual(registry.getCollectionSummary(calibs), expected2) 

2530 # Explicitly calling Registry.refresh() should load those same 

2531 # summaries, via a totally different code path. 

2532 registry.refresh() 

2533 self.assertEqual(registry.getCollectionSummary("imported_g"), expected1) 

2534 self.assertEqual(registry.getCollectionSummary("imported_r"), expected1) 

2535 self.assertEqual(registry.getCollectionSummary(tag), expected2) 

2536 self.assertEqual(registry.getCollectionSummary(calibs), expected2) 

2537 
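# A sketch of the invariant the chained-collection check above relies on: a
# chain's summary is the union of its children's, so every dataset type seen
# in a child also appears in the chain (method names as used in the test):
def _sketch_chain_summary_invariant(registry, chain_name, child_names):
    chain_types = set(registry.getCollectionSummary(chain_name).dataset_types.names)
    for child in child_names:
        child_types = set(registry.getCollectionSummary(child).dataset_types.names)
        assert child_types <= chain_types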

2538 def testBindInQueryDatasets(self): 

2539 """Test that the bind parameter is correctly forwarded in 

2540 queryDatasets recursion. 

2541 """ 

2542 registry = self.makeRegistry() 

2545 self.loadData(registry, "base.yaml") 

2546 self.loadData(registry, "datasets.yaml") 

2547 self.assertEqual( 

2548 set(registry.queryDatasets("flat", band="r", collections=...)), 

2549 set(registry.queryDatasets("flat", where="band=my_band", bind={"my_band": "r"}, collections=...)), 

2550 ) 

2551 
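# Bind parameters also compose with IN lists; a hedged sketch assuming (as in
# the daf_butler expression language) that a bound identifier inside IN
# parentheses may hold a list of values:
def _sketch_bind_with_in(registry, bands):
    return set(
        registry.queryDatasets(
            "flat",
            collections=...,
            where="band IN (my_bands)",
            bind={"my_bands": list(bands)},
        )
    )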

2552 def testQueryIntRangeExpressions(self): 

2553 """Test integer range expressions in ``where`` arguments. 

2554 

2555 Note that our expressions use inclusive stop values, unlike Python's. 

2556 """ 

2557 registry = self.makeRegistry() 

2558 self.loadData(registry, "base.yaml") 

2559 self.assertEqual( 

2560 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (1..2)")), 

2561 {registry.expandDataId(instrument="Cam1", detector=n) for n in [1, 2]}, 

2562 ) 

2563 self.assertEqual( 

2564 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (1..4:2)")), 

2565 {registry.expandDataId(instrument="Cam1", detector=n) for n in [1, 3]}, 

2566 ) 

2567 self.assertEqual( 

2568 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (2..4:2)")), 

2569 {registry.expandDataId(instrument="Cam1", detector=n) for n in [2, 4]}, 

2570 ) 

2571 
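# A tiny reference implementation (for illustration only) of the inclusive
# START..STOP[:STRIDE] range syntax tested above:
def _expand_range_expression(expr: str) -> list[int]:
    bounds, _, stride = expr.partition(":")
    start, _, stop = bounds.partition("..")
    return list(range(int(start), int(stop) + 1, int(stride) if stride else 1))

assert _expand_range_expression("1..2") == [1, 2]
assert _expand_range_expression("1..4:2") == [1, 3]
assert _expand_range_expression("2..4:2") == [2, 4]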

2572 def testQueryResultSummaries(self): 

2573 """Test summary methods like `count`, `any`, and `explain_no_results` 

2574 on `DataCoordinateQueryResults` and `DatasetQueryResults`. 

2575 """ 

2576 registry = self.makeRegistry() 

2577 self.loadData(registry, "base.yaml") 

2578 self.loadData(registry, "datasets.yaml") 

2579 self.loadData(registry, "spatial.yaml") 

2580 # Default test dataset has two collections, each with both flats and 

2581 # biases. Add a new collection with only biases. 

2582 registry.registerCollection("biases", CollectionType.TAGGED) 

2583 registry.associate("biases", registry.queryDatasets("bias", collections=["imported_g"])) 

2584 # First query yields two results, and involves no postprocessing. 

2585 query1 = registry.queryDataIds(["physical_filter"], band="r") 

2586 self.assertTrue(query1.any(execute=False, exact=False)) 

2587 self.assertTrue(query1.any(execute=True, exact=False)) 

2588 self.assertTrue(query1.any(execute=True, exact=True)) 

2589 self.assertEqual(query1.count(exact=False), 2) 

2590 self.assertEqual(query1.count(exact=True), 2) 

2591 self.assertFalse(list(query1.explain_no_results())) 

2592 # Second query should yield no results, which we should see when 

2593 # we attempt to expand the data ID. 

2594 query2 = registry.queryDataIds(["physical_filter"], band="h") 

2595 # There's no execute=False, exact=False test here because the behavior 

2596 # is not something we want to guarantee in this case (and exact=False 

2597 # says either answer is legal). 

2598 self.assertFalse(query2.any(execute=True, exact=False)) 

2599 self.assertFalse(query2.any(execute=True, exact=True)) 

2600 self.assertEqual(query2.count(exact=False), 0) 

2601 self.assertEqual(query2.count(exact=True), 0) 

2602 self.assertTrue(list(query2.explain_no_results())) 

2603 # These queries yield no results due to various problems that can be 

2604 # spotted prior to execution, yielding helpful diagnostics. 

2605 base_query = registry.queryDataIds(["detector", "physical_filter"]) 

2606 queries_and_snippets = [ 

2607 ( 

2608 # Dataset type name doesn't match any existing dataset types. 

2609 registry.queryDatasets("nonexistent", collections=...), 

2610 ["nonexistent"], 

2611 ), 

2612 ( 

2613 # Dataset type object isn't registered. 

2614 registry.queryDatasets( 

2615 DatasetType( 

2616 "nonexistent", 

2617 dimensions=["instrument"], 

2618 universe=registry.dimensions, 

2619 storageClass="Image", 

2620 ), 

2621 collections=..., 

2622 ), 

2623 ["nonexistent"], 

2624 ), 

2625 ( 

2626 # No datasets of this type in this collection. 

2627 registry.queryDatasets("flat", collections=["biases"]), 

2628 ["flat", "biases"], 

2629 ), 

2630 ( 

2631 # No datasets of this type in this collection. 

2632 base_query.findDatasets("flat", collections=["biases"]), 

2633 ["flat", "biases"], 

2634 ), 

2635 ( 

2636 # No collections matching at all. 

2637 registry.queryDatasets("flat", collections=re.compile("potato.+")), 

2638 ["potato"], 

2639 ), 

2640 ] 

2641 # The behavior of these additional queries is slated to change in the 

2642 # future, so we also check for deprecation warnings. 

2643 with self.assertWarns(FutureWarning): 

2644 queries_and_snippets.append( 

2645 ( 

2646 # Dataset type name doesn't match any existing dataset 

2647 # types. 

2648 registry.queryDataIds(["detector"], datasets=["nonexistent"], collections=...), 

2649 ["nonexistent"], 

2650 ) 

2651 ) 

2652 with self.assertWarns(FutureWarning): 

2653 queries_and_snippets.append( 

2654 ( 

2655 # Dataset type name doesn't match any existing dataset 

2656 # types. 

2657 registry.queryDimensionRecords("detector", datasets=["nonexistent"], collections=...), 

2658 ["nonexistent"], 

2659 ) 

2660 ) 

2661 for query, snippets in queries_and_snippets: 

2662 self.assertFalse(query.any(execute=False, exact=False)) 

2663 self.assertFalse(query.any(execute=True, exact=False)) 

2664 self.assertFalse(query.any(execute=True, exact=True)) 

2665 self.assertEqual(query.count(exact=False), 0) 

2666 self.assertEqual(query.count(exact=True), 0) 

2667 messages = list(query.explain_no_results()) 

2668 self.assertTrue(messages) 

2669 # Want all expected snippets to appear in at least one message. 

2670 self.assertTrue( 

2671 any( 

2672 all(snippet in message for snippet in snippets) for message in query.explain_no_results() 

2673 ), 

2674 messages, 

2675 ) 

2676 

2677 # This query does yield results, but should also emit a warning because 

2678 # passing dataset type patterns to queryDataIds is deprecated; just 

2679 # look for the warning. 

2680 with self.assertWarns(FutureWarning): 

2681 registry.queryDataIds(["detector"], datasets=re.compile("^nonexistent$"), collections=...) 

2682 

2683 # These queries yield no results due to problems that can be identified 

2684 # by cheap follow-up queries, yielding helpful diagnostics. 

2685 for query, snippets in [ 

2686 ( 

2687 # No records for one of the involved dimensions. 

2688 registry.queryDataIds(["subfilter"]), 

2689 ["no rows", "subfilter"], 

2690 ), 

2691 ( 

2692 # No records for one of the involved dimensions. 

2693 registry.queryDimensionRecords("subfilter"), 

2694 ["no rows", "subfilter"], 

2695 ), 

2696 ]: 

2697 self.assertFalse(query.any(execute=True, exact=False)) 

2698 self.assertFalse(query.any(execute=True, exact=True)) 

2699 self.assertEqual(query.count(exact=True), 0) 

2700 messages = list(query.explain_no_results()) 

2701 self.assertTrue(messages) 

2702 # Want all expected snippets to appear in at least one message. 

2703 self.assertTrue( 

2704 any( 

2705 all(snippet in message for snippet in snippets) for message in query.explain_no_results() 

2706 ), 

2707 messages, 

2708 ) 

2709 

2710 # This query yields four overlaps in the database, but one is filtered 

2711 # out in postprocessing. The count queries aren't accurate because 

2712 # they don't account for duplication that happens due to an internal 

2713 # join against commonSkyPix. 

2714 query3 = registry.queryDataIds(["visit", "tract"], instrument="Cam1", skymap="SkyMap1") 

2715 self.assertEqual( 

2716 { 

2717 DataCoordinate.standardize( 

2718 instrument="Cam1", 

2719 skymap="SkyMap1", 

2720 visit=v, 

2721 tract=t, 

2722 universe=registry.dimensions, 

2723 ) 

2724 for v, t in [(1, 0), (2, 0), (2, 1)] 

2725 }, 

2726 set(query3), 

2727 ) 

2728 self.assertTrue(query3.any(execute=False, exact=False)) 

2729 self.assertTrue(query3.any(execute=True, exact=False)) 

2730 self.assertTrue(query3.any(execute=True, exact=True)) 

2731 self.assertGreaterEqual(query3.count(exact=False), 4) 

2732 self.assertGreaterEqual(query3.count(exact=True, discard=True), 3) 

2733 self.assertFalse(list(query3.explain_no_results())) 

2734 # This query yields overlaps in the database, but all are filtered 

2735 # out in postprocessing. The count queries again aren't very useful. 

2736 # We have to use `where=` here to avoid an optimization that 

2737 # (currently) skips the spatial postprocess-filtering because it 

2738 # recognizes that no spatial join is necessary. That's not ideal, but 

2739 # fixing it is out of scope for this ticket. 

2740 query4 = registry.queryDataIds( 

2741 ["visit", "tract"], 

2742 instrument="Cam1", 

2743 skymap="SkyMap1", 

2744 where="visit=1 AND detector=1 AND tract=0 AND patch=4", 

2745 ) 

2746 self.assertFalse(set(query4)) 

2747 self.assertTrue(query4.any(execute=False, exact=False)) 

2748 self.assertTrue(query4.any(execute=True, exact=False)) 

2749 self.assertFalse(query4.any(execute=True, exact=True)) 

2750 self.assertGreaterEqual(query4.count(exact=False), 1) 

2751 self.assertEqual(query4.count(exact=True, discard=True), 0) 

2752 messages = query4.explain_no_results() 

2753 self.assertTrue(messages) 

2754 self.assertTrue(any("overlap" in message for message in messages)) 

2755 # This query should yield results from one dataset type but not the 

2756 # other, which is not registered. 

2757 query5 = registry.queryDatasets(["bias", "nonexistent"], collections=["biases"]) 

2758 self.assertTrue(set(query5)) 

2759 self.assertTrue(query5.any(execute=False, exact=False)) 

2760 self.assertTrue(query5.any(execute=True, exact=False)) 

2761 self.assertTrue(query5.any(execute=True, exact=True)) 

2762 self.assertGreaterEqual(query5.count(exact=False), 1) 

2763 self.assertGreaterEqual(query5.count(exact=True), 1) 

2764 self.assertFalse(list(query5.explain_no_results())) 

2765 # This query applies a selection that yields no results, fully in the 

2766 # database. Explaining why it fails involves traversing the relation 

2767 # tree and running a LIMIT 1 query at each level that has the potential 

2768 # to remove rows. 

2769 query6 = registry.queryDimensionRecords( 

2770 "detector", where="detector.purpose = 'no-purpose'", instrument="Cam1" 

2771 ) 

2772 self.assertEqual(query6.count(exact=True), 0) 

2773 messages = query6.explain_no_results() 

2774 self.assertTrue(messages) 

2775 self.assertTrue(any("no-purpose" in message for message in messages)) 

2776 
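# The triage pattern exercised above, condensed into a reusable sketch: cheap
# structural checks first, exact execution second, and diagnostics only when
# a query comes back empty.
def _diagnose_query(query):
    if not query.any(execute=False, exact=False):
        return list(query.explain_no_results())  # doomed before execution
    if not query.any(execute=True, exact=True):
        return list(query.explain_no_results())  # empty after execution
    return []  # non-empty; count(exact=False) gives a cheap upper bound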

2777 def testQueryDataIdsOrderBy(self): 

2778 """Test order_by and limit on result returned by queryDataIds().""" 

2779 registry = self.makeRegistry() 

2780 self.loadData(registry, "base.yaml") 

2781 self.loadData(registry, "datasets.yaml") 

2782 self.loadData(registry, "spatial.yaml") 

2783 

2784 def do_query(dimensions=("visit", "tract"), datasets=None, collections=None): 

2785 return registry.queryDataIds( 

2786 dimensions, datasets=datasets, collections=collections, instrument="Cam1", skymap="SkyMap1" 

2787 ) 

2788 

2789 Test = namedtuple( 

2790 "testQueryDataIdsOrderByTest", 

2791 ("order_by", "keys", "result", "limit", "datasets", "collections"), 

2792 defaults=(None, None, None), 

2793 ) 

2794 

2795 test_data = ( 

2796 Test("tract,visit", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))), 

2797 Test("-tract,visit", "tract,visit", ((1, 2), (1, 2), (0, 1), (0, 1), (0, 2), (0, 2))), 

2798 Test("tract,-visit", "tract,visit", ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2))), 

2799 Test("-tract,-visit", "tract,visit", ((1, 2), (1, 2), (0, 2), (0, 2), (0, 1), (0, 1))), 

2800 Test( 

2801 "tract.id,visit.id", 

2802 "tract,visit", 

2803 ((0, 1), (0, 1), (0, 2)), 

2804 limit=(3,), 

2805 ), 

2806 Test("-tract,-visit", "tract,visit", ((1, 2), (1, 2), (0, 2)), limit=(3,)), 

2807 Test("tract,visit", "tract,visit", ((0, 2), (1, 2), (1, 2)), limit=(3, 3)), 

2808 Test("-tract,-visit", "tract,visit", ((0, 1),), limit=(3, 5)), 

2809 Test( 

2810 "tract,visit.exposure_time", "tract,visit", ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2)) 

2811 ), 

2812 Test( 

2813 "-tract,-visit.exposure_time", "tract,visit", ((1, 2), (1, 2), (0, 1), (0, 1), (0, 2), (0, 2)) 

2814 ), 

2815 Test("tract,-exposure_time", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))), 

2816 Test("tract,visit.name", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))), 

2817 Test( 

2818 "tract,-timespan.begin,timespan.end", 

2819 "tract,visit", 

2820 ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2)), 

2821 ), 

2822 Test("visit.day_obs,exposure.day_obs", "visit,exposure", ()), 

2823 Test("visit.timespan.begin,-exposure.timespan.begin", "visit,exposure", ()), 

2824 Test( 

2825 "tract,detector", 

2826 "tract,detector", 

2827 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)), 

2828 datasets="flat", 

2829 collections="imported_r", 

2830 ), 

2831 Test( 

2832 "tract,detector.full_name", 

2833 "tract,detector", 

2834 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)), 

2835 datasets="flat", 

2836 collections="imported_r", 

2837 ), 

2838 Test( 

2839 "tract,detector.raft,detector.name_in_raft", 

2840 "tract,detector", 

2841 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)), 

2842 datasets="flat", 

2843 collections="imported_r", 

2844 ), 

2845 ) 

2846 

2847 for test in test_data: 

2848 order_by = test.order_by.split(",") 

2849 keys = test.keys.split(",") 

2850 query = do_query(keys, test.datasets, test.collections).order_by(*order_by) 

2851 if test.limit is not None: 

2852 query = query.limit(*test.limit) 

2853 dataIds = tuple(tuple(dataId[k] for k in keys) for dataId in query) 

2854 self.assertEqual(dataIds, test.result) 

2855 

2856 # and materialize 

2857 query = do_query(keys).order_by(*order_by) 

2858 if test.limit is not None: 

2859 query = query.limit(*test.limit) 

2860 with self.assertRaises(RelationalAlgebraError): 

2861 with query.materialize(): 

2862 pass 

2863 

2864 # errors in a name 

2865 for order_by in ("", "-"): 

2866 with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"): 

2867 list(do_query().order_by(order_by)) 

2868 

2869 for order_by in ("undimension.name", "-undimension.name"): 

2870 with self.assertRaisesRegex(ValueError, "Unknown dimension element name 'undimension'"): 

2871 list(do_query().order_by(order_by)) 

2872 

2873 for order_by in ("attract", "-attract"): 

2874 with self.assertRaisesRegex(ValueError, "Metadata 'attract' cannot be found in any dimension"): 

2875 list(do_query().order_by(order_by)) 

2876 

2877 with self.assertRaisesRegex(ValueError, "Metadata 'exposure_time' exists in more than one dimension"): 

2878 list(do_query(("exposure", "visit")).order_by("exposure_time")) 

2879 

2880 with self.assertRaisesRegex(ValueError, "Timespan exists in more than one dimesion"): 

2881 list(do_query(("exposure", "visit")).order_by("timespan.begin")) 

2882 

2883 with self.assertRaisesRegex( 

2884 ValueError, "Cannot find any temporal dimension element for 'timespan.begin'" 

2885 ): 

2886 list(do_query("tract").order_by("timespan.begin")) 

2887 

2888 with self.assertRaisesRegex(ValueError, "Cannot use 'timespan.begin' with non-temporal element"): 

2889 list(do_query("tract").order_by("tract.timespan.begin")) 

2890 

2891 with self.assertRaisesRegex(ValueError, "Field 'name' does not exist in 'tract'."): 

2892 list(do_query("tract").order_by("tract.name")) 

2893 
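# A compact usage sketch of the sort/paging API tested above: a "-" prefix
# means descending, dotted names disambiguate fields, and limit() takes
# (limit, offset); `registry` is assumed.
def _sketch_order_by_paging(registry, page_size=3, page=0):
    query = (
        registry.queryDataIds(["visit", "tract"], instrument="Cam1", skymap="SkyMap1")
        .order_by("-tract", "visit.exposure_time")
        .limit(page_size, page * page_size)
    )
    return [(data_id["tract"], data_id["visit"]) for data_id in query]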

2894 def testQueryDataIdsGovernorExceptions(self): 

2895 """Test exceptions raised by queryDataIds() for incorrect governors.""" 

2896 registry = self.makeRegistry() 

2897 self.loadData(registry, "base.yaml") 

2898 self.loadData(registry, "datasets.yaml") 

2899 self.loadData(registry, "spatial.yaml") 

2900 

2901 def do_query(dimensions, dataId=None, where="", bind=None, **kwargs): 

2902 return registry.queryDataIds(dimensions, dataId=dataId, where=where, bind=bind, **kwargs) 

2903 

2904 Test = namedtuple( 

2905 "testQueryDataIdExceptionsTest", 

2906 ("dimensions", "dataId", "where", "bind", "kwargs", "exception", "count"), 

2907 defaults=(None, None, None, {}, None, 0), 

2908 ) 

2909 

2910 test_data = ( 

2911 Test("tract,visit", count=6), 

2912 Test("tract,visit", kwargs={"instrument": "Cam1", "skymap": "SkyMap1"}, count=6), 

2913 Test( 

2914 "tract,visit", kwargs={"instrument": "Cam2", "skymap": "SkyMap1"}, exception=DataIdValueError 

2915 ), 

2916 Test("tract,visit", dataId={"instrument": "Cam1", "skymap": "SkyMap1"}, count=6), 

2917 Test( 

2918 "tract,visit", dataId={"instrument": "Cam1", "skymap": "SkyMap2"}, exception=DataIdValueError 

2919 ), 

2920 Test("tract,visit", where="instrument='Cam1' AND skymap='SkyMap1'", count=6), 

2921 Test("tract,visit", where="instrument='Cam1' AND skymap='SkyMap5'", exception=DataIdValueError), 

2922 Test( 

2923 "tract,visit", 

2924 where="instrument=cam AND skymap=map", 

2925 bind={"cam": "Cam1", "map": "SkyMap1"}, 

2926 count=6, 

2927 ), 

2928 Test( 

2929 "tract,visit", 

2930 where="instrument=cam AND skymap=map", 

2931 bind={"cam": "Cam", "map": "SkyMap"}, 

2932 exception=DataIdValueError, 

2933 ), 

2934 ) 

2935 

2936 for test in test_data: 

2937 dimensions = test.dimensions.split(",") 

2938 if test.exception: 

2939 with self.assertRaises(test.exception): 

2940 do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs).count() 

2941 else: 

2942 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs) 

2943 self.assertEqual(query.count(discard=True), test.count) 

2944 

2945 # and materialize 

2946 if test.exception: 

2947 with self.assertRaises(test.exception): 

2948 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs) 

2949 with query.materialize() as materialized: 

2950 materialized.count(discard=True) 

2951 else: 

2952 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs) 

2953 with query.materialize() as materialized: 

2954 self.assertEqual(materialized.count(discard=True), test.count) 

2955 
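# A sketch of the failure mode tested above: unknown governor values raise
# DataIdValueError rather than returning an empty result, so callers can
# distinguish "bad data ID" from "no matching rows".
def _count_or_none(registry, **kwargs):
    from lsst.daf.butler.registry import DataIdValueError

    try:
        return registry.queryDataIds(["tract", "visit"], **kwargs).count(discard=True)
    except DataIdValueError:
        return None  # the governor value does not exist at all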

2956 def testQueryDimensionRecordsOrderBy(self): 

2957 """Test order_by and limit on result returned by 

2958 queryDimensionRecords(). 

2959 """ 

2960 registry = self.makeRegistry() 

2961 self.loadData(registry, "base.yaml") 

2962 self.loadData(registry, "datasets.yaml") 

2963 self.loadData(registry, "spatial.yaml") 

2964 

2965 def do_query(element, datasets=None, collections=None): 

2966 return registry.queryDimensionRecords( 

2967 element, instrument="Cam1", datasets=datasets, collections=collections 

2968 ) 

2969 

2970 query = do_query("detector") 

2971 self.assertEqual(len(list(query)), 4) 

2972 

2973 Test = namedtuple( 

2974 "testQueryDataIdsOrderByTest", 

2975 ("element", "order_by", "result", "limit", "datasets", "collections"), 

2976 defaults=(None, None, None), 

2977 ) 

2978 

2979 test_data = ( 

2980 Test("detector", "detector", (1, 2, 3, 4)), 

2981 Test("detector", "-detector", (4, 3, 2, 1)), 

2982 Test("detector", "raft,-name_in_raft", (2, 1, 4, 3)), 

2983 Test("detector", "-detector.purpose", (4,), limit=(1,)), 

2984 Test("detector", "-purpose,detector.raft,name_in_raft", (2, 3), limit=(2, 2)), 

2985 Test("visit", "visit", (1, 2)), 

2986 Test("visit", "-visit.id", (2, 1)), 

2987 Test("visit", "zenith_angle", (1, 2)), 

2988 Test("visit", "-visit.name", (2, 1)), 

2989 Test("visit", "day_obs,-timespan.begin", (2, 1)), 

2990 ) 

2991 

2992 for test in test_data: 

2993 order_by = test.order_by.split(",") 

2994 query = do_query(test.element).order_by(*order_by) 

2995 if test.limit is not None: 

2996 query = query.limit(*test.limit) 

2997 dataIds = tuple(rec.id for rec in query) 

2998 self.assertEqual(dataIds, test.result) 

2999 

3000 # errors in a name 

3001 for order_by in ("", "-"): 

3002 with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"): 

3003 list(do_query("detector").order_by(order_by)) 

3004 

3005 for order_by in ("undimension.name", "-undimension.name"): 

3006 with self.assertRaisesRegex(ValueError, "Element name mismatch: 'undimension'"): 

3007 list(do_query("detector").order_by(order_by)) 

3008 

3009 for order_by in ("attract", "-attract"): 

3010 with self.assertRaisesRegex(ValueError, "Field 'attract' does not exist in 'detector'."): 

3011 list(do_query("detector").order_by(order_by)) 

3012 

3013 def testQueryDimensionRecordsExceptions(self): 

3014 """Test exceptions raised by queryDimensionRecords().""" 

3015 registry = self.makeRegistry() 

3016 self.loadData(registry, "base.yaml") 

3017 self.loadData(registry, "datasets.yaml") 

3018 self.loadData(registry, "spatial.yaml") 

3019 

3020 result = registry.queryDimensionRecords("detector") 

3021 self.assertEqual(result.count(), 4) 

3022 result = registry.queryDimensionRecords("detector", instrument="Cam1") 

3023 self.assertEqual(result.count(), 4) 

3024 result = registry.queryDimensionRecords("detector", dataId={"instrument": "Cam1"}) 

3025 self.assertEqual(result.count(), 4) 

3026 result = registry.queryDimensionRecords("detector", where="instrument='Cam1'") 

3027 self.assertEqual(result.count(), 4) 

3028 result = registry.queryDimensionRecords("detector", where="instrument=instr", bind={"instr": "Cam1"}) 

3029 self.assertEqual(result.count(), 4) 

3030 

3031 with self.assertRaisesRegex(DataIdValueError, "dimension instrument"): 

3032 result = registry.queryDimensionRecords("detector", instrument="NotCam1") 

3033 result.count() 

3034 

3035 with self.assertRaisesRegex(DataIdValueError, "dimension instrument"): 

3036 result = registry.queryDimensionRecords("detector", dataId={"instrument": "NotCam1"}) 

3037 result.count() 

3038 

3039 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"): 

3040 result = registry.queryDimensionRecords("detector", where="instrument='NotCam1'") 

3041 result.count() 

3042 

3043 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"): 

3044 result = registry.queryDimensionRecords( 

3045 "detector", where="instrument=instr", bind={"instr": "NotCam1"} 

3046 ) 

3047 result.count() 

3048 

3049 def testDatasetConstrainedDimensionRecordQueries(self): 

3050 """Test that queryDimensionRecords works even when given a dataset 

3051 constraint whose dimensions extend beyond the requested dimension 

3052 element's. 

3053 """ 

3054 registry = self.makeRegistry() 

3055 self.loadData(registry, "base.yaml") 

3056 self.loadData(registry, "datasets.yaml") 

3057 # Query for physical_filter dimension records, using a dataset that 

3058 # has both physical_filter and detector dimensions. 

3059 records = registry.queryDimensionRecords( 

3060 "physical_filter", 

3061 datasets=["flat"], 

3062 collections="imported_r", 

3063 ) 

3064 self.assertEqual({record.name for record in records}, {"Cam1-R1", "Cam1-R2"}) 

3065 # Trying to constrain by all dataset types is an error. 

3066 with self.assertRaises(TypeError): 

3067 list(registry.queryDimensionRecords("physical_filter", datasets=..., collections="imported_r")) 

3068 

3069 def testSkyPixDatasetQueries(self): 

3070 """Test that we can build queries involving skypix dimensions as long 

3071 as a dataset type that uses those dimensions is included. 

3072 """ 

3073 registry = self.makeRegistry() 

3074 self.loadData(registry, "base.yaml") 

3075 dataset_type = DatasetType( 

3076 "a", dimensions=["htm7", "instrument"], universe=registry.dimensions, storageClass="int" 

3077 ) 

3078 registry.registerDatasetType(dataset_type) 

3079 run = "r" 

3080 registry.registerRun(run) 

3081 # First try queries where there are no datasets; the concern is whether 

3082 # we can even build and execute these queries without raising, even 

3083 # when "doomed" query shortcuts are in play. 

3084 self.assertFalse( 

3085 list(registry.queryDataIds(["htm7", "instrument"], datasets=dataset_type, collections=run)) 

3086 ) 

3087 self.assertFalse(list(registry.queryDatasets(dataset_type, collections=run))) 

3088 # Now add a dataset and see that we can get it back. 

3089 htm7 = registry.dimensions.skypix["htm"][7].pixelization 

3090 data_id = registry.expandDataId(instrument="Cam1", htm7=htm7.universe()[0][0]) 

3091 (ref,) = registry.insertDatasets(dataset_type, [data_id], run=run) 

3092 self.assertEqual( 

3093 set(registry.queryDataIds(["htm7", "instrument"], datasets=dataset_type, collections=run)), 

3094 {data_id}, 

3095 ) 

3096 self.assertEqual(set(registry.queryDatasets(dataset_type, collections=run)), {ref}) 

3097 
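# A sketch of where an htm7 index comes from in practice, computed directly
# with lsst.sphgeom (level 7 matches the dataset type above; the coordinates
# are arbitrary):
def _sketch_htm7_index(ra_deg=150.0, dec_deg=2.5):
    import lsst.sphgeom

    pixelization = lsst.sphgeom.HtmPixelization(7)
    point = lsst.sphgeom.UnitVector3d(lsst.sphgeom.LonLat.fromDegrees(ra_deg, dec_deg))
    return pixelization.index(point)  # a valid value for the "htm7" dimension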

3098 def testDatasetIdFactory(self): 

3099 """Simple test for DatasetIdFactory, mostly to catch potential changes 

3100 in its API. 

3101 """ 

3102 registry = self.makeRegistry() 

3103 factory = registry.datasetIdFactory 

3104 dataset_type = DatasetType( 

3105 "datasetType", 

3106 dimensions=["detector", "instrument"], 

3107 universe=registry.dimensions, 

3108 storageClass="int", 

3109 ) 

3110 run = "run" 

3111 data_id = DataCoordinate.standardize(instrument="Cam1", detector=1, graph=dataset_type.dimensions) 

3112 

3113 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.UNIQUE) 

3114 self.assertIsInstance(datasetId, uuid.UUID) 

3115 self.assertEqual(datasetId.version, 4) 

3116 

3117 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE) 

3118 self.assertIsInstance(datasetId, uuid.UUID) 

3119 self.assertEqual(datasetId.version, 5) 

3120 

3121 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN) 

3122 self.assertIsInstance(datasetId, uuid.UUID) 

3123 self.assertEqual(datasetId.version, 5) 

3124 
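# The version checks above imply a reproducibility contract: version-5 UUIDs
# are deterministic functions of their inputs while version-4 UUIDs are
# random. A sketch of that contract using the same factory API:
def _sketch_id_determinism(factory, run, dataset_type, data_id):
    from lsst.daf.butler.registry.interfaces import DatasetIdGenEnum

    a = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN)
    b = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN)
    assert a == b  # deterministic: same inputs, same UUID
    c = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.UNIQUE)
    d = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.UNIQUE)
    assert c != d  # random v4 UUIDs collide only with negligible probability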

3125 def testExposureQueries(self): 

3126 """Test query methods using arguments sourced from the exposure log 

3127 service. 

3128 

3129 The most complete test dataset currently available to daf_butler tests 

3130 is the hsc-rc2-subset.yaml export (which is unfortunately distinct from 

3131 the lsst/rc2_subset GitHub repo), but that does not have 'exposure' 

3132 dimension records as it was focused on providing nontrivial spatial 

3133 overlaps between visit+detector and tract+patch. So in this test we 

3134 need to translate queries that originally used the exposure dimension 

3135 to use the (very similar) visit dimension instead. 

3136 """ 

3137 registry = self.makeRegistry() 

3138 self.loadData(registry, "hsc-rc2-subset.yaml") 

3139 self.assertEqual( 

3140 [ 

3141 record.id 

3142 for record in registry.queryDimensionRecords("visit", instrument="HSC") 

3143 .order_by("id") 

3144 .limit(5) 

3145 ], 

3146 [318, 322, 326, 330, 332], 

3147 ) 

3148 self.assertEqual( 

3149 [ 

3150 data_id["visit"] 

3151 for data_id in registry.queryDataIds(["visit"], instrument="HSC").order_by("id").limit(5) 

3152 ], 

3153 [318, 322, 326, 330, 332], 

3154 ) 

3155 self.assertEqual( 

3156 [ 

3157 record.id 

3158 for record in registry.queryDimensionRecords("detector", instrument="HSC") 

3159 .order_by("full_name") 

3160 .limit(5) 

3161 ], 

3162 [73, 72, 71, 70, 65], 

3163 ) 

3164 self.assertEqual( 

3165 [ 

3166 data_id["detector"] 

3167 for data_id in registry.queryDataIds(["detector"], instrument="HSC") 

3168 .order_by("full_name") 

3169 .limit(5) 

3170 ], 

3171 [73, 72, 71, 70, 65], 

3172 ) 

3173 

3174 def test_long_query_names(self) -> None: 

3175 """Test that queries involving very long names are handled correctly. 

3176 

3177 This is especially important for PostgreSQL, which truncates symbols 

3178 longer than 64 chars, but it's worth testing for all DBs. 

3179 """ 

3180 registry = self.makeRegistry() 

3181 name = "abcd" * 17 

3182 registry.registerDatasetType( 

3183 DatasetType( 

3184 name, 

3185 dimensions=(), 

3186 storageClass="Exposure", 

3187 universe=registry.dimensions, 

3188 ) 

3189 ) 

3190 # We need to search more than one collection that actually contains a 

3191 # matching dataset; otherwise an optimization makes findFirst=True a 

3192 # no-op and would sidestep any bugs caused by name truncation. 

3193 run1 = "run1" 

3194 registry.registerRun(run1) 

3195 run2 = "run2" 

3196 registry.registerRun(run2) 

3197 (ref1,) = registry.insertDatasets(name, [DataCoordinate.makeEmpty(registry.dimensions)], run1) 

3198 registry.insertDatasets(name, [DataCoordinate.makeEmpty(registry.dimensions)], run2) 

3199 self.assertEqual( 

3200 set(registry.queryDatasets(name, collections=[run1, run2], findFirst=True)), 

3201 {ref1}, 

3202 ) 

3203 

3204 def test_skypix_constraint_queries(self) -> None: 

3205 """Test queries spatially constrained by a skypix data ID.""" 

3206 registry = self.makeRegistry() 

3207 self.loadData(registry, "hsc-rc2-subset.yaml") 

3208 patch_regions = { 

3209 (data_id["tract"], data_id["patch"]): data_id.region 

3210 for data_id in registry.queryDataIds(["patch"]).expanded() 

3211 } 

3212 skypix_dimension: SkyPixDimension = registry.dimensions["htm11"] 

3213 # This check ensures the test doesn't become trivial due to a config 

3214 # change; if it does, just pick a different HTM level. 

3215 self.assertNotEqual(skypix_dimension, registry.dimensions.commonSkyPix) 

3216 # Gather all skypix IDs that definitely overlap at least one of these 

3217 # patches. 

3218 relevant_skypix_ids = lsst.sphgeom.RangeSet() 

3219 for patch_region in patch_regions.values(): 

3220 relevant_skypix_ids |= skypix_dimension.pixelization.interior(patch_region) 

3221 # Look for a "nontrivial" skypix_id that overlaps at least one patch 

3222 # and does not overlap at least one other patch. 

3223 for skypix_id in itertools.chain.from_iterable( 

3224 range(begin, end) for begin, end in relevant_skypix_ids 

3225 ): 

3226 skypix_region = skypix_dimension.pixelization.pixel(skypix_id) 

3227 overlapping_patches = { 

3228 patch_key 

3229 for patch_key, patch_region in patch_regions.items() 

3230 if not patch_region.isDisjointFrom(skypix_region) 

3231 } 

3232 if overlapping_patches and overlapping_patches != patch_regions.keys(): 

3233 break 

3234 else: 

3235 raise RuntimeError("Could not find usable skypix ID for this dimension configuration.") 

3236 self.assertEqual( 

3237 { 

3238 (data_id["tract"], data_id["patch"]) 

3239 for data_id in registry.queryDataIds( 

3240 ["patch"], 

3241 dataId={skypix_dimension.name: skypix_id}, 

3242 ) 

3243 }, 

3244 overlapping_patches, 

3245 ) 

3246 
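# The interior() call above returns only pixels wholly contained in a region;
# envelope() would also include boundary pixels. A sketch of the distinction
# (any lsst.sphgeom Region works):
def _sketch_interior_vs_envelope(region, level=11):
    import lsst.sphgeom

    pix = lsst.sphgeom.HtmPixelization(level)
    inner = pix.interior(region)  # pixels definitely inside the region
    outer = pix.envelope(region)  # pixels possibly intersecting the region
    assert inner.isWithin(outer)
    return inner, outer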

3247 def test_spatial_constraint_queries(self) -> None: 

3248 """Test queries in which one spatial dimension in the constraint (data 

3249 ID or ``where`` string) constrains a different spatial dimension in the 

3250 query result columns. 

3251 """ 

3252 registry = self.makeRegistry() 

3253 self.loadData(registry, "hsc-rc2-subset.yaml") 

3254 patch_regions = { 

3255 (data_id["tract"], data_id["patch"]): data_id.region 

3256 for data_id in registry.queryDataIds(["patch"]).expanded() 

3257 } 

3258 observation_regions = { 

3259 (data_id["visit"], data_id["detector"]): data_id.region 

3260 for data_id in registry.queryDataIds(["visit", "detector"]).expanded() 

3261 } 

3262 all_combos = { 

3263 (patch_key, observation_key) 

3264 for patch_key, observation_key in itertools.product(patch_regions, observation_regions) 

3265 } 

3266 overlapping_combos = { 

3267 (patch_key, observation_key) 

3268 for patch_key, observation_key in all_combos 

3269 if not patch_regions[patch_key].isDisjointFrom(observation_regions[observation_key]) 

3270 } 

3271 # Check a direct spatial join with no constraint first. 

3272 self.assertEqual( 

3273 { 

3274 ((data_id["tract"], data_id["patch"]), (data_id["visit"], data_id["detector"])) 

3275 for data_id in registry.queryDataIds(["patch", "visit", "detector"]) 

3276 }, 

3277 overlapping_combos, 

3278 ) 

3279 overlaps_by_patch: defaultdict[tuple[int, int], set[tuple[str, str]]] = defaultdict(set) 

3280 overlaps_by_observation: defaultdict[tuple[int, int], set[tuple[str, str]]] = defaultdict(set) 

3281 for patch_key, observation_key in overlapping_combos: 

3282 overlaps_by_patch[patch_key].add(observation_key) 

3283 overlaps_by_observation[observation_key].add(patch_key) 

3284 # Find patches and observations that overlap at least one of the other 

3285 # but not all of the other. 

3286 nontrivial_patch = next( 

3287 iter( 

3288 patch_key 

3289 for patch_key, observation_keys in overlaps_by_patch.items() 

3290 if observation_keys and observation_keys != observation_regions.keys() 

3291 ) 

3292 ) 

3293 nontrivial_observation = next( 

3294 iter( 

3295 observation_key 

3296 for observation_key, patch_keys in overlaps_by_observation.items() 

3297 if patch_keys and patch_keys != patch_regions.keys() 

3298 ) 

3299 ) 

3300 # Use the nontrivial patches and observations as constraints on the 

3301 # other dimensions in various ways, first via a 'where' expression. 

3302 # It's better in general to use 'bind' instead of f-strings, but these 

3303 # are all integers so there are no quoting concerns. 

3304 self.assertEqual( 

3305 { 

3306 (data_id["visit"], data_id["detector"]) 

3307 for data_id in registry.queryDataIds( 

3308 ["visit", "detector"], 

3309 where=f"tract={nontrivial_patch[0]} AND patch={nontrivial_patch[1]}", 

3310 skymap="hsc_rings_v1", 

3311 ) 

3312 }, 

3313 overlaps_by_patch[nontrivial_patch], 

3314 ) 

3315 self.assertEqual( 

3316 { 

3317 (data_id["tract"], data_id["patch"]) 

3318 for data_id in registry.queryDataIds( 

3319 ["patch"], 

3320 where=f"visit={nontrivial_observation[0]} AND detector={nontrivial_observation[1]}", 

3321 instrument="HSC", 

3322 ) 

3323 }, 

3324 overlaps_by_observation[nontrivial_observation], 

3325 ) 

3326 # and then via the dataId argument. 

3327 self.assertEqual( 

3328 { 

3329 (data_id["visit"], data_id["detector"]) 

3330 for data_id in registry.queryDataIds( 

3331 ["visit", "detector"], 

3332 dataId={ 

3333 "tract": nontrivial_patch[0], 

3334 "patch": nontrivial_patch[1], 

3335 }, 

3336 skymap="hsc_rings_v1", 

3337 ) 

3338 }, 

3339 overlaps_by_patch[nontrivial_patch], 

3340 ) 

3341 self.assertEqual( 

3342 { 

3343 (data_id["tract"], data_id["patch"]) 

3344 for data_id in registry.queryDataIds( 

3345 ["patch"], 

3346 dataId={ 

3347 "visit": nontrivial_observation[0], 

3348 "detector": nontrivial_observation[1], 

3349 }, 

3350 instrument="HSC", 

3351 ) 

3352 }, 

3353 overlaps_by_observation[nontrivial_observation], 

3354 ) 

3355 

3356 def test_query_projection_drop_postprocessing(self) -> None: 

3357 """Test that projections and deduplications on query objects can 

3358 drop post-query region filtering to ensure the query remains in 

3359 the SQL engine. 

3360 """ 

3361 registry = self.makeRegistry() 

3362 self.loadData(registry, "base.yaml") 

3363 self.loadData(registry, "spatial.yaml") 

3364 

3365 def pop_transfer(tree: Relation) -> Relation: 

3366 """If a relation tree terminates with a transfer to a new engine, 

3367 return the relation prior to that transfer. If not, return the 

3368 original relation. 

3369 """ 

3370 match tree: 

3371 case Transfer(target=target): 

3372 return target 

3373 case _: 

3374 return tree 

3375 

3376 # There's no public way to get a Query object yet, so we get one from a 

3377 # DataCoordinateQueryResults private attribute. When a public API is 

3378 # available this test should use it. 

3379 query = registry.queryDataIds(["visit", "detector", "tract", "patch"])._query 

3380 # We expect this query to terminate in the iteration engine originally, 

3381 # because region-filtering is necessary. 

3382 self.assertIsInstance(pop_transfer(query.relation).engine, iteration.Engine) 

3383 # If we deduplicate, we usually have to do that downstream of the 

3384 # filtering. That means the deduplication has to happen in the 

3385 # iteration engine. 

3386 self.assertIsInstance(pop_transfer(query.projected(unique=True).relation).engine, iteration.Engine) 

3387 # If we pass drop_postprocessing, we instead drop the region filtering 

3388 # so the deduplication can happen in SQL (though there might still be 

3389 # transfer to iteration at the tail of the tree that we can ignore; 

3390 # that's what the pop_transfer takes care of here). 

3391 self.assertIsInstance( 

3392 pop_transfer(query.projected(unique=True, drop_postprocessing=True).relation).engine, 

3393 sql.Engine, 

3394 )