# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RegistryTests"]

import itertools
import logging
import os
import re
import unittest
import uuid
from abc import ABC, abstractmethod
from collections import defaultdict, namedtuple
from collections.abc import Iterator
from datetime import datetime, timedelta
from typing import TYPE_CHECKING

import astropy.time
import sqlalchemy

try:
    import numpy as np
except ImportError:
    np = None

import lsst.sphgeom
from lsst.daf.relation import Relation, RelationalAlgebraError, Transfer, iteration, sql

from ...core import (
    DataCoordinate,
    DataCoordinateSet,
    DatasetAssociation,
    DatasetIdFactory,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    NamedValueSet,
    SkyPixDimension,
    StorageClass,
    Timespan,
    ddl,
)
from .._collection_summary import CollectionSummary
from .._collectionType import CollectionType
from .._config import RegistryConfig
from .._exceptions import (
    ArgumentError,
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    DatasetTypeError,
    InconsistentDataIdError,
    MissingCollectionError,
    MissingDatasetTypeError,
    NoDefaultCollectionError,
    OrphanedRecordError,
)
from ..interfaces import ButlerAttributeExistsError

if TYPE_CHECKING:
    from .._registry import Registry


class RegistryTests(ABC):
    """Generic tests for the `Registry` class that can be subclassed to
    generate tests for different configurations.
    """

    collectionsManager: str | None = None
    """Name of the collections manager class. If a subclass provides a value
    for this member, it overrides the name specified in the default
    configuration (`str`).
    """

    datasetsManager: str | dict[str, str] | None = None
    """Name or configuration dictionary of the datasets manager class. If a
    subclass provides a value for this member, it overrides the name
    specified in the default configuration (`str` or `dict`).
    """

    @classmethod
    @abstractmethod
    def getDataDir(cls) -> str:
        """Return the root directory containing test data YAML files."""
        raise NotImplementedError()

    def makeRegistryConfig(self) -> RegistryConfig:
        """Create a RegistryConfig used to create a registry.

        This method should be called by a subclass from `makeRegistry`.
        The returned instance will be pre-configured based on the values of
        class members, and default-configured for all other parameters.
        Subclasses that need a default configuration should just instantiate
        `RegistryConfig` directly.
        """
        config = RegistryConfig()
        if self.collectionsManager:
            config["managers", "collections"] = self.collectionsManager
        if self.datasetsManager:
            config["managers", "datasets"] = self.datasetsManager
        return config

    @abstractmethod
    def makeRegistry(self, share_repo_with: Registry | None = None) -> Registry | None:
        """Return the Registry instance to be tested.

        Parameters
        ----------
        share_repo_with : `Registry`, optional
            If provided, the new registry should point to the same data
            repository as this existing registry.

        Returns
        -------
        registry : `Registry`
            New `Registry` instance, or `None` *only* if `share_repo_with` is
            not `None` and this test case does not support that argument
            (e.g. it is impossible with in-memory SQLite DBs).
        """
        raise NotImplementedError()
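
    # For illustration only: a sketch of a concrete `makeRegistry` backed by
    # an in-memory SQLite database. It assumes a `Registry.createFromConfig`
    # factory is available; since an in-memory database cannot be shared, the
    # `share_repo_with` case returns `None`, as documented above.
    #
    #     def makeRegistry(self, share_repo_with=None):
    #         if share_repo_with is not None:
    #             return None
    #         config = self.makeRegistryConfig()
    #         config["db"] = "sqlite://"
    #         return Registry.createFromConfig(config)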

    def loadData(self, registry: Registry, filename: str):
        """Load registry test data from ``getDataDir/<filename>``,
        which should be a YAML import/export file.
        """
        from ...transfers import YamlRepoImportBackend

        with open(os.path.join(self.getDataDir(), filename)) as stream:
            backend = YamlRepoImportBackend(stream, registry)
            backend.register()
            backend.load(datastore=None)

    def checkQueryResults(self, results, expected):
        """Check that a query results object contains expected values.

        Parameters
        ----------
        results : `DataCoordinateQueryResults` or `DatasetQueryResults`
            A lazy-evaluation query results object.
        expected : `list`
            A list of `DataCoordinate` or `DatasetRef` objects that should be
            equal to the results of the query, aside from ordering.
        """
        self.assertCountEqual(list(results), expected)
        self.assertEqual(results.count(), len(expected))
        if expected:
            self.assertTrue(results.any())
        else:
            self.assertFalse(results.any())
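
    # Typical usage, for illustration (the query and expected values here are
    # hypothetical):
    #
    #     results = registry.queryDataIds(["detector"], instrument="Cam1")
    #     self.checkQueryResults(results, expected_data_ids)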

    def testOpaque(self):
        """Tests for `Registry.registerOpaqueTable`,
        `Registry.insertOpaqueData`, `Registry.fetchOpaqueData`, and
        `Registry.deleteOpaqueData`.
        """
        registry = self.makeRegistry()
        table = "opaque_table_for_testing"
        registry.registerOpaqueTable(
            table,
            spec=ddl.TableSpec(
                fields=[
                    ddl.FieldSpec("id", dtype=sqlalchemy.BigInteger, primaryKey=True),
                    ddl.FieldSpec("name", dtype=sqlalchemy.String, length=16, nullable=False),
                    ddl.FieldSpec("count", dtype=sqlalchemy.SmallInteger, nullable=True),
                ],
            ),
        )
        rows = [
            {"id": 1, "name": "one", "count": None},
            {"id": 2, "name": "two", "count": 5},
            {"id": 3, "name": "three", "count": 6},
        ]
        registry.insertOpaqueData(table, *rows)
        self.assertCountEqual(rows, list(registry.fetchOpaqueData(table)))
        self.assertEqual(rows[0:1], list(registry.fetchOpaqueData(table, id=1)))
        self.assertEqual(rows[1:2], list(registry.fetchOpaqueData(table, name="two")))
        self.assertEqual(rows[0:1], list(registry.fetchOpaqueData(table, id=(1, 3), name=("one", "two"))))
        self.assertEqual(rows, list(registry.fetchOpaqueData(table, id=(1, 2, 3))))
        # Test a very long IN clause that exceeds the SQLite limit on the
        # number of parameters. SQLite says the limit is 32k, but it looks
        # like it is much higher.
        self.assertEqual(rows, list(registry.fetchOpaqueData(table, id=list(range(300_000)))))
        # Two IN clauses, each longer than the 1k batch size; the first has
        # duplicates, and the second has matching elements in different
        # batches (after sorting).
        self.assertEqual(
            rows[0:2],
            list(
                registry.fetchOpaqueData(
                    table,
                    id=list(range(1000)) + list(range(100, 0, -1)),
                    name=["one"] + [f"q{i}" for i in range(2200)] + ["two"],
                )
            ),
        )
        self.assertEqual([], list(registry.fetchOpaqueData(table, id=1, name="two")))
        registry.deleteOpaqueData(table, id=3)
        self.assertCountEqual(rows[:2], list(registry.fetchOpaqueData(table)))
        registry.deleteOpaqueData(table)
        self.assertEqual([], list(registry.fetchOpaqueData(table)))
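
    # For illustration only: very long ``IN`` clauses like the one above are
    # typically handled by splitting the values into fixed-size batches and
    # combining the per-batch results; this sketch shows the batching idea,
    # not the actual registry implementation.
    #
    #     def chunked(values, size=1000):
    #         values = sorted(set(values))
    #         for i in range(0, len(values), size):
    #             yield values[i : i + size]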

    def testDatasetType(self):
        """Tests for `Registry.registerDatasetType` and
        `Registry.getDatasetType`.
        """
        registry = self.makeRegistry()
        # Check valid insert
        datasetTypeName = "test"
        storageClass = StorageClass("testDatasetType")
        registry.storageClasses.registerStorageClass(storageClass)
        dimensions = registry.dimensions.extract(("instrument", "visit"))
        differentDimensions = registry.dimensions.extract(("instrument", "patch"))
        inDatasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        # Inserting for the first time should return True
        self.assertTrue(registry.registerDatasetType(inDatasetType))
        outDatasetType1 = registry.getDatasetType(datasetTypeName)
        self.assertEqual(outDatasetType1, inDatasetType)

        # Re-inserting should work
        self.assertFalse(registry.registerDatasetType(inDatasetType))
        # Except when they are not identical
        with self.assertRaises(ConflictingDefinitionError):
            nonIdenticalDatasetType = DatasetType(datasetTypeName, differentDimensions, storageClass)
            registry.registerDatasetType(nonIdenticalDatasetType)

        # Template can be None
        datasetTypeName = "testNoneTemplate"
        storageClass = StorageClass("testDatasetType2")
        registry.storageClasses.registerStorageClass(storageClass)
        dimensions = registry.dimensions.extract(("instrument", "visit"))
        inDatasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(inDatasetType)
        outDatasetType2 = registry.getDatasetType(datasetTypeName)
        self.assertEqual(outDatasetType2, inDatasetType)

        allTypes = set(registry.queryDatasetTypes())
        self.assertEqual(allTypes, {outDatasetType1, outDatasetType2})

    def testDimensions(self):
        """Tests for `Registry.insertDimensionData`,
        `Registry.syncDimensionData`, and `Registry.expandDataId`.
        """
        registry = self.makeRegistry()
        dimensionName = "instrument"
        dimension = registry.dimensions[dimensionName]
        dimensionValue = {
            "name": "DummyCam",
            "visit_max": 10,
            "visit_system": 0,
            "exposure_max": 10,
            "detector_max": 2,
            "class_name": "lsst.pipe.base.Instrument",
        }
        registry.insertDimensionData(dimensionName, dimensionValue)
        # Inserting the same value twice should fail
        with self.assertRaises(sqlalchemy.exc.IntegrityError):
            registry.insertDimensionData(dimensionName, dimensionValue)
        # expandDataId should retrieve the record we just inserted
        self.assertEqual(
            registry.expandDataId(instrument="DummyCam", graph=dimension.graph)
            .records[dimensionName]
            .toDict(),
            dimensionValue,
        )
        # expandDataId should raise if there is no record with the given ID.
        with self.assertRaises(DataIdValueError):
            registry.expandDataId({"instrument": "Unknown"}, graph=dimension.graph)
        # band doesn't have a table; insert should fail.
        with self.assertRaises(TypeError):
            registry.insertDimensionData("band", {"band": "i"})
        dimensionName2 = "physical_filter"
        dimension2 = registry.dimensions[dimensionName2]
        dimensionValue2 = {"name": "DummyCam_i", "band": "i"}
        # Missing required dependency ("instrument") should fail
        with self.assertRaises(KeyError):
            registry.insertDimensionData(dimensionName2, dimensionValue2)
        # Adding required dependency should fix the failure
        dimensionValue2["instrument"] = "DummyCam"
        registry.insertDimensionData(dimensionName2, dimensionValue2)
        # expandDataId should retrieve the record we just inserted.
        self.assertEqual(
            registry.expandDataId(instrument="DummyCam", physical_filter="DummyCam_i", graph=dimension2.graph)
            .records[dimensionName2]
            .toDict(),
            dimensionValue2,
        )
        # Use syncDimensionData to insert a new record successfully.
        dimensionName3 = "detector"
        dimensionValue3 = {
            "instrument": "DummyCam",
            "id": 1,
            "full_name": "one",
            "name_in_raft": "zero",
            "purpose": "SCIENCE",
        }
        self.assertTrue(registry.syncDimensionData(dimensionName3, dimensionValue3))
        # Sync that again. Note that one field ("raft") is NULL, and that
        # should be okay.
        self.assertFalse(registry.syncDimensionData(dimensionName3, dimensionValue3))
        # Now try that sync with the same primary key but a different value.
        # This should fail.
        with self.assertRaises(ConflictingDefinitionError):
            registry.syncDimensionData(
                dimensionName3,
                {
                    "instrument": "DummyCam",
                    "id": 1,
                    "full_name": "one",
                    "name_in_raft": "four",
                    "purpose": "SCIENCE",
                },
            )

    @unittest.skipIf(np is None, "numpy not available.")
    def testNumpyDataId(self):
        """Test that we can use a numpy int in a dataId."""
        registry = self.makeRegistry()
        dimensionEntries = [
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            # Using an np.int64 here fails unless Records.fromDict is also
            # patched to look for numbers.Integral
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        for args in dimensionEntries:
            registry.insertDimensionData(*args)

        # Try a normal integer and something that looks like an int but
        # is not.
        for visit_id in (42, np.int64(42)):
            with self.subTest(visit_id=visit_id, id_type=type(visit_id).__name__):
                expanded = registry.expandDataId({"instrument": "DummyCam", "visit": visit_id})
                self.assertEqual(expanded["visit"], int(visit_id))
                self.assertIsInstance(expanded["visit"], int)
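
    # For illustration only: the coercion being tested amounts to accepting
    # any `numbers.Integral` (such as `np.int64`) and normalizing it to a
    # plain `int`, roughly like this sketch:
    #
    #     import numbers
    #
    #     def normalize_id(value):
    #         return int(value) if isinstance(value, numbers.Integral) else value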

    def testDataIdRelationships(self):
        """Test that `Registry.expandDataId` raises an exception when the
        given keys are inconsistent.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        # Insert a few more dimension records for the next test.
        registry.insertDimensionData(
            "exposure",
            {"instrument": "Cam1", "id": 1, "obs_id": "one", "physical_filter": "Cam1-G"},
        )
        registry.insertDimensionData(
            "exposure",
            {"instrument": "Cam1", "id": 2, "obs_id": "two", "physical_filter": "Cam1-G"},
        )
        registry.insertDimensionData(
            "visit_system",
            {"instrument": "Cam1", "id": 0, "name": "one-to-one"},
        )
        registry.insertDimensionData(
            "visit",
            {"instrument": "Cam1", "id": 1, "name": "one", "physical_filter": "Cam1-G", "visit_system": 0},
        )
        registry.insertDimensionData(
            "visit_definition",
            {"instrument": "Cam1", "visit": 1, "exposure": 1, "visit_system": 0},
        )
        with self.assertRaises(InconsistentDataIdError):
            registry.expandDataId(
                {"instrument": "Cam1", "visit": 1, "exposure": 2},
            )

    def testDataset(self):
        """Basic tests for `Registry.insertDatasets`, `Registry.getDataset`,
        and `Registry.removeDatasets`.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        run = "tésτ"
        registry.registerRun(run)
        datasetType = registry.getDatasetType("bias")
        dataId = {"instrument": "Cam1", "detector": 2}
        (ref,) = registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
        outRef = registry.getDataset(ref.id)
        self.assertIsNotNone(ref.id)
        self.assertEqual(ref, outRef)
        with self.assertRaises(ConflictingDefinitionError):
            registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
        registry.removeDatasets([ref])
        self.assertIsNone(registry.findDataset(datasetType, dataId, collections=[run]))

    def testFindDataset(self):
        """Tests for `Registry.findDataset`."""
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        run = "tésτ"
        datasetType = registry.getDatasetType("bias")
        dataId = {"instrument": "Cam1", "detector": 4}
        registry.registerRun(run)
        (inputRef,) = registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
        outputRef = registry.findDataset(datasetType, dataId, collections=[run])
        self.assertEqual(outputRef, inputRef)
        # Check that retrieval with invalid dataId raises
        with self.assertRaises(LookupError):
            dataId = {"instrument": "Cam1"}  # no detector
            registry.findDataset(datasetType, dataId, collections=run)
        # Check that different dataIds match to different datasets
        dataId1 = {"instrument": "Cam1", "detector": 1}
        (inputRef1,) = registry.insertDatasets(datasetType, dataIds=[dataId1], run=run)
        dataId2 = {"instrument": "Cam1", "detector": 2}
        (inputRef2,) = registry.insertDatasets(datasetType, dataIds=[dataId2], run=run)
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=run), inputRef1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=run), inputRef2)
        self.assertNotEqual(registry.findDataset(datasetType, dataId1, collections=run), inputRef2)
        self.assertNotEqual(registry.findDataset(datasetType, dataId2, collections=run), inputRef1)
        # Check that requesting a non-existing dataId returns None
        nonExistingDataId = {"instrument": "Cam1", "detector": 3}
        self.assertIsNone(registry.findDataset(datasetType, nonExistingDataId, collections=run))
        # Search more than one collection, in which two have the right
        # dataset type and another does not.
        registry.registerRun("empty")
        self.loadData(registry, "datasets-uuid.yaml")
        bias1 = registry.findDataset("bias", instrument="Cam1", detector=2, collections=["imported_g"])
        self.assertIsNotNone(bias1)
        bias2 = registry.findDataset("bias", instrument="Cam1", detector=2, collections=["imported_r"])
        self.assertIsNotNone(bias2)
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "imported_g", "imported_r"]
            ),
        )
        self.assertEqual(
            bias2,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "imported_r", "imported_g"]
            ),
        )
        # Search more than one collection, with one of them a CALIBRATION
        # collection.
        registry.registerCollection("Cam1/calib", CollectionType.CALIBRATION)
        timespan = Timespan(
            begin=astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai"),
            end=astropy.time.Time("2020-01-01T02:00:00", format="isot", scale="tai"),
        )
        registry.certify("Cam1/calib", [bias2], timespan=timespan)
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias",
                instrument="Cam1",
                detector=2,
                collections=["empty", "imported_g", "Cam1/calib"],
                timespan=timespan,
            ),
        )
        self.assertEqual(
            bias2,
            registry.findDataset(
                "bias",
                instrument="Cam1",
                detector=2,
                collections=["empty", "Cam1/calib", "imported_g"],
                timespan=timespan,
            ),
        )
        # If we try to search those same collections without a timespan, it
        # should still work, since the CALIBRATION collection is ignored.
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "imported_g", "Cam1/calib"]
            ),
        )
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "Cam1/calib", "imported_g"]
            ),
        )

    def testRemoveDatasetTypeSuccess(self):
        """Test that Registry.removeDatasetType works when there are no
        datasets of that type present.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        registry.removeDatasetType("flat")
        with self.assertRaises(MissingDatasetTypeError):
            registry.getDatasetType("flat")

    def testRemoveDatasetTypeFailure(self):
        """Test that Registry.removeDatasetType raises when there are datasets
        of that type present or if the dataset type is for a component.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        with self.assertRaises(OrphanedRecordError):
            registry.removeDatasetType("flat")
        with self.assertRaises(ValueError):
            registry.removeDatasetType(DatasetType.nameWithComponent("flat", "image"))

    def testImportDatasetsUUID(self):
        """Test for `Registry._importDatasets` with UUID dataset IDs."""
        if isinstance(self.datasetsManager, str):
            if not self.datasetsManager.endswith(".ByDimensionsDatasetRecordStorageManagerUUID"):
                self.skipTest(f"Unexpected dataset manager {self.datasetsManager}")
        elif isinstance(self.datasetsManager, dict) and not self.datasetsManager["cls"].endswith(
            ".ByDimensionsDatasetRecordStorageManagerUUID"
        ):
            self.skipTest(f"Unexpected dataset manager {self.datasetsManager['cls']}")

        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        for run in range(6):
            registry.registerRun(f"run{run}")
        datasetTypeBias = registry.getDatasetType("bias")
        datasetTypeFlat = registry.getDatasetType("flat")
        dataIdBias1 = {"instrument": "Cam1", "detector": 1}
        dataIdBias2 = {"instrument": "Cam1", "detector": 2}
        dataIdFlat1 = {"instrument": "Cam1", "detector": 1, "physical_filter": "Cam1-G", "band": "g"}

        ref = DatasetRef(datasetTypeBias, dataIdBias1, run="run0")
        (ref1,) = registry._importDatasets([ref])
        # The UUID is used without change.
        self.assertEqual(ref.id, ref1.id)

        # All the different failure modes:
        refs = (
            # Importing the same DatasetRef with a different dataset ID is an
            # error.
            DatasetRef(datasetTypeBias, dataIdBias1, run="run0"),
            # Same DatasetId but different DataId.
            DatasetRef(datasetTypeBias, dataIdBias2, id=ref1.id, run="run0"),
            DatasetRef(datasetTypeFlat, dataIdFlat1, id=ref1.id, run="run0"),
            # Same DatasetRef and DatasetId but different run.
            DatasetRef(datasetTypeBias, dataIdBias1, id=ref1.id, run="run1"),
        )
        for ref in refs:
            with self.assertRaises(ConflictingDefinitionError):
                registry._importDatasets([ref])

        # Test non-unique IDs; they can be re-imported multiple times.
        for run, idGenMode in ((2, DatasetIdGenEnum.DATAID_TYPE), (4, DatasetIdGenEnum.DATAID_TYPE_RUN)):
            with self.subTest(idGenMode=idGenMode):
                # Make a dataset ref with a reproducible dataset ID.
                ref = DatasetRef(datasetTypeBias, dataIdBias1, run=f"run{run}", id_generation_mode=idGenMode)
                (ref1,) = registry._importDatasets([ref])
                self.assertIsInstance(ref1.id, uuid.UUID)
                self.assertEqual(ref1.id.version, 5)
                self.assertEqual(ref1.id, ref.id)

                # Importing it again is OK.
                (ref2,) = registry._importDatasets([ref1])
                self.assertEqual(ref2.id, ref1.id)

                # Cannot import to a different run with the same ID.
                ref = DatasetRef(datasetTypeBias, dataIdBias1, id=ref1.id, run=f"run{run+1}")
                with self.assertRaises(ConflictingDefinitionError):
                    registry._importDatasets([ref])

                ref = DatasetRef(
                    datasetTypeBias, dataIdBias1, run=f"run{run+1}", id_generation_mode=idGenMode
                )
                if idGenMode is DatasetIdGenEnum.DATAID_TYPE:
                    # Cannot import the same DATAID_TYPE ref into a new run.
                    with self.assertRaises(ConflictingDefinitionError):
                        (ref2,) = registry._importDatasets([ref])
                else:
                    # A DATAID_TYPE_RUN ref can be imported into a new run.
                    (ref2,) = registry._importDatasets([ref])
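
    # For illustration only: the reproducibility checked above comes from
    # version-5 (name-based) UUIDs, which are deterministic functions of a
    # namespace and a name. A sketch of the idea (the namespace and name
    # layout here are made up, not those of DatasetIdFactory):
    #
    #     import uuid
    #
    #     ns = uuid.UUID(int=0)
    #     id1 = uuid.uuid5(ns, "bias+instrument=Cam1,detector=1")
    #     id2 = uuid.uuid5(ns, "bias+instrument=Cam1,detector=1")
    #     assert id1 == id2 and id1.version == 5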

    def testDatasetTypeComponentQueries(self):
        """Test component options when querying for dataset types.

        All of the behavior here is deprecated, so many of these tests are
        currently wrapped in a context to check that we get a warning whenever
        a component dataset type is actually returned.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        # Test querying for dataset types with different inputs.
        # First query for all dataset types; components should only be
        # included when components=True.
        self.assertEqual({"bias", "flat"}, NamedValueSet(registry.queryDatasetTypes()).names)
        self.assertEqual({"bias", "flat"}, NamedValueSet(registry.queryDatasetTypes(components=False)).names)
        with self.assertWarns(FutureWarning):
            self.assertLess(
                {"bias", "flat", "bias.wcs", "flat.photoCalib"},
                NamedValueSet(registry.queryDatasetTypes(components=True)).names,
            )
        # Use a pattern that can match either parent or components. Again,
        # components are only returned if components=True.
        self.assertEqual({"bias"}, NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"))).names)
        self.assertEqual(
            {"bias"}, NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"), components=False)).names
        )
        with self.assertWarns(FutureWarning):
            self.assertLess(
                {"bias", "bias.wcs"},
                NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"), components=True)).names,
            )
        # This pattern matches only a component. In this case we also return
        # that component dataset type if components=None.
        with self.assertWarns(FutureWarning):
            self.assertEqual(
                {"bias.wcs"}, NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"))).names
            )
        self.assertEqual(
            set(),
            NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"), components=False)).names,
        )
        with self.assertWarns(FutureWarning):
            self.assertEqual(
                {"bias.wcs"},
                NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"), components=True)).names,
            )
        # Add a dataset type using a StorageClass that we'll then remove;
        # check that this does not affect our ability to query for dataset
        # types (though it will warn).
        tempStorageClass = StorageClass(
            name="TempStorageClass",
            components={
                "data1": registry.storageClasses.getStorageClass("StructuredDataDict"),
                "data2": registry.storageClasses.getStorageClass("StructuredDataDict"),
            },
        )
        registry.storageClasses.registerStorageClass(tempStorageClass)
        datasetType = DatasetType(
            "temporary",
            dimensions=["instrument"],
            storageClass=tempStorageClass,
            universe=registry.dimensions,
        )
        registry.registerDatasetType(datasetType)
        registry.storageClasses._unregisterStorageClass(tempStorageClass.name)
        datasetType._storageClass = None
        del tempStorageClass
        # Querying for all dataset types, including components, should include
        # at least all non-component dataset types (and I don't want to
        # enumerate all of the Exposure components for bias and flat here).
        with self.assertWarns(FutureWarning):
            with self.assertLogs("lsst.daf.butler.registry", logging.WARN) as cm:
                everything = NamedValueSet(registry.queryDatasetTypes(components=True))
        self.assertIn("TempStorageClass", cm.output[0])
        self.assertLess({"bias", "flat", "temporary"}, everything.names)
        # It should not include "temporary.data", because we tried to remove
        # the storage class that would tell it about that. So if the next line
        # fails (i.e. "temporary.data" _is_ in everything.names), it means
        # this part of the test isn't doing anything, because the _unregister
        # call above isn't simulating the real-life case we want it to
        # simulate, in which different versions of daf_butler in entirely
        # different Python processes interact with the same repo.
        self.assertNotIn("temporary.data", everything.names)
        # Query for dataset types that start with "temp". This should again
        # not include the component, and also not fail.
        with self.assertLogs("lsst.daf.butler.registry", logging.WARN) as cm:
            startsWithTemp = NamedValueSet(registry.queryDatasetTypes(re.compile("temp.*"), components=True))
        self.assertIn("TempStorageClass", cm.output[0])
        self.assertEqual({"temporary"}, startsWithTemp.names)
        # Querying with no components should not warn at all.
        with self.assertLogs("lsst.daf.butler.registries", logging.WARN) as cm:
            startsWithTemp = NamedValueSet(registry.queryDatasetTypes(re.compile("temp.*"), components=False))
            # Must issue a warning of our own to be captured.
            logging.getLogger("lsst.daf.butler.registries").warning("test message")
        self.assertEqual(len(cm.output), 1)
        self.assertIn("test message", cm.output[0])

    def testComponentLookups(self):
        """Test searching for component datasets via their parents.

        All of the behavior here is deprecated, so many of these tests are
        currently wrapped in a context to check that we get a warning whenever
        a component dataset is actually returned.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        # Test getting the child dataset type (which does still exist in the
        # Registry), and check for consistency with
        # DatasetRef.makeComponentRef.
        collection = "imported_g"
        parentType = registry.getDatasetType("bias")
        childType = registry.getDatasetType("bias.wcs")
        parentRefResolved = registry.findDataset(
            parentType, collections=collection, instrument="Cam1", detector=1
        )
        self.assertIsInstance(parentRefResolved, DatasetRef)
        self.assertEqual(childType, parentRefResolved.makeComponentRef("wcs").datasetType)
        # Search for a single dataset with findDataset.
        childRef1 = registry.findDataset("bias.wcs", collections=collection, dataId=parentRefResolved.dataId)
        self.assertEqual(childRef1, parentRefResolved.makeComponentRef("wcs"))
        # Search for detector data IDs constrained by component dataset
        # existence with queryDataIds.
        with self.assertWarns(FutureWarning):
            dataIds = registry.queryDataIds(
                ["detector"],
                datasets=["bias.wcs"],
                collections=collection,
            ).toSet()
        self.assertEqual(
            dataIds,
            DataCoordinateSet(
                {
                    DataCoordinate.standardize(instrument="Cam1", detector=d, graph=parentType.dimensions)
                    for d in (1, 2, 3)
                },
                parentType.dimensions,
            ),
        )
        # Search for multiple datasets of a single type with queryDatasets.
        with self.assertWarns(FutureWarning):
            childRefs2 = set(
                registry.queryDatasets(
                    "bias.wcs",
                    collections=collection,
                )
            )
        self.assertEqual({ref.datasetType for ref in childRefs2}, {childType})
        self.assertEqual({ref.dataId for ref in childRefs2}, set(dataIds))

    def testCollections(self):
        """Tests for registry methods that manage collections."""
        registry = self.makeRegistry()
        other_registry = self.makeRegistry(share_repo_with=registry)
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        run1 = "imported_g"
        run2 = "imported_r"
        # Test setting a collection docstring after it has been created.
        registry.setCollectionDocumentation(run1, "doc for run1")
        self.assertEqual(registry.getCollectionDocumentation(run1), "doc for run1")
        registry.setCollectionDocumentation(run1, None)
        self.assertIsNone(registry.getCollectionDocumentation(run1))
        datasetType = "bias"
        # Find some datasets via their run's collection.
        dataId1 = {"instrument": "Cam1", "detector": 1}
        ref1 = registry.findDataset(datasetType, dataId1, collections=run1)
        self.assertIsNotNone(ref1)
        dataId2 = {"instrument": "Cam1", "detector": 2}
        ref2 = registry.findDataset(datasetType, dataId2, collections=run1)
        self.assertIsNotNone(ref2)
        # Associate those into a new collection, then look for them there.
        tag1 = "tag1"
        registry.registerCollection(tag1, type=CollectionType.TAGGED, doc="doc for tag1")
        # Check that we can query for old and new collections by type.
        self.assertEqual(set(registry.queryCollections(collectionTypes=CollectionType.RUN)), {run1, run2})
        self.assertEqual(
            set(registry.queryCollections(collectionTypes={CollectionType.TAGGED, CollectionType.RUN})),
            {tag1, run1, run2},
        )
        self.assertEqual(registry.getCollectionDocumentation(tag1), "doc for tag1")
        registry.associate(tag1, [ref1, ref2])
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        # Disassociate one and verify that we can't find it there anymore...
        registry.disassociate(tag1, [ref1])
        self.assertIsNone(registry.findDataset(datasetType, dataId1, collections=tag1))
        # ...but we can still find ref2 in tag1, and ref1 in the run.
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=run1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        collections = set(registry.queryCollections())
        self.assertEqual(collections, {run1, run2, tag1})
        # Associate both refs into tag1 again; ref2 is already there, but that
        # should be a harmless no-op.
        registry.associate(tag1, [ref1, ref2])
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        # Get a different dataset (from a different run) that has the same
        # dataset type and data ID as ref2.
        ref2b = registry.findDataset(datasetType, dataId2, collections=run2)
        self.assertNotEqual(ref2, ref2b)
        # Attempting to associate that into tag1 should be an error.
        with self.assertRaises(ConflictingDefinitionError):
            registry.associate(tag1, [ref2b])
        # That error shouldn't have messed up what we had before.
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        # Attempt to associate the conflicting dataset again, this time with
        # a dataset that isn't in the collection and won't cause a conflict.
        # Should also fail without modifying anything.
        dataId3 = {"instrument": "Cam1", "detector": 3}
        ref3 = registry.findDataset(datasetType, dataId3, collections=run1)
        with self.assertRaises(ConflictingDefinitionError):
            registry.associate(tag1, [ref3, ref2b])
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        self.assertIsNone(registry.findDataset(datasetType, dataId3, collections=tag1))
        # Register a chained collection that searches [tag1, run2].
        chain1 = "chain1"
        registry.registerCollection(chain1, type=CollectionType.CHAINED)
        self.assertIs(registry.getCollectionType(chain1), CollectionType.CHAINED)
        # Chained collection exists, but has no collections in it.
        self.assertFalse(registry.getCollectionChain(chain1))
        # If we query for all collections, we should get the chained
        # collection only if we don't ask to flatten it (i.e. yield only its
        # children).
        self.assertEqual(set(registry.queryCollections(flattenChains=False)), {tag1, run1, run2, chain1})
        self.assertEqual(set(registry.queryCollections(flattenChains=True)), {tag1, run1, run2})
        # Attempt to set its child collections to something circular; that
        # should fail.
        with self.assertRaises(ValueError):
            registry.setCollectionChain(chain1, [tag1, chain1])
        # Add the child collections.
        registry.setCollectionChain(chain1, [tag1, run2])
        self.assertEqual(list(registry.getCollectionChain(chain1)), [tag1, run2])
        self.assertEqual(registry.getCollectionParentChains(tag1), {chain1})
        self.assertEqual(registry.getCollectionParentChains(run2), {chain1})
        # Refresh the other registry that points to the same repo, and make
        # sure it can see the things we've done (note that this does require
        # an explicit refresh(); that's the documented behavior, because
        # caching is ~impossible otherwise).
        if other_registry is not None:
            other_registry.refresh()
            self.assertEqual(list(other_registry.getCollectionChain(chain1)), [tag1, run2])
            self.assertEqual(other_registry.getCollectionParentChains(tag1), {chain1})
            self.assertEqual(other_registry.getCollectionParentChains(run2), {chain1})
        # Searching for dataId1 or dataId2 in the chain should return ref1 and
        # ref2, because both are in tag1.
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=chain1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=chain1), ref2)
        # Now disassociate ref2 from tag1. The search (for bias) with
        # dataId2 in chain1 should then:
        # 1. not find it in tag1
        # 2. find a different dataset in run2
        registry.disassociate(tag1, [ref2])
        ref2b = registry.findDataset(datasetType, dataId2, collections=chain1)
        self.assertNotEqual(ref2b, ref2)
        self.assertEqual(ref2b, registry.findDataset(datasetType, dataId2, collections=run2))
        # Define a new chain so we can test recursive chains.
        chain2 = "chain2"
        registry.registerCollection(chain2, type=CollectionType.CHAINED)
        registry.setCollectionChain(chain2, [run2, chain1])
        self.assertEqual(registry.getCollectionParentChains(chain1), {chain2})
        self.assertEqual(registry.getCollectionParentChains(run2), {chain1, chain2})
        # Query for collections matching a regex.
        self.assertCountEqual(
            list(registry.queryCollections(re.compile("imported_."), flattenChains=False)),
            ["imported_r", "imported_g"],
        )
        # Query for collections matching a regex or an explicit str.
        self.assertCountEqual(
            list(registry.queryCollections([re.compile("imported_."), "chain1"], flattenChains=False)),
            ["imported_r", "imported_g", "chain1"],
        )
        # A search for bias with dataId1 should find it via tag1 in chain2,
        # recursing, because it is not in run1.
        self.assertIsNone(registry.findDataset(datasetType, dataId1, collections=run2))
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=chain2), ref1)
        # A search for bias with dataId2 should find it in run2 (ref2b).
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=chain2), ref2b)
        # Search for a flat that is in run2. That should not be found
        # at the front of chain2, because of the restriction to bias
        # on run2 there, but it should be found at the end of chain1.
        dataId4 = {"instrument": "Cam1", "detector": 3, "physical_filter": "Cam1-R2"}
        ref4 = registry.findDataset("flat", dataId4, collections=run2)
        self.assertIsNotNone(ref4)
        self.assertEqual(ref4, registry.findDataset("flat", dataId4, collections=chain2))
        # Deleting a collection that's part of a CHAINED collection is not
        # allowed, and is exception-safe.
        with self.assertRaises(sqlalchemy.exc.IntegrityError):
            registry.removeCollection(run2)
        self.assertEqual(registry.getCollectionType(run2), CollectionType.RUN)
        with self.assertRaises(sqlalchemy.exc.IntegrityError):
            registry.removeCollection(chain1)
        self.assertEqual(registry.getCollectionType(chain1), CollectionType.CHAINED)
        # Actually remove chain2, and test that it's gone by asking for its
        # type.
        registry.removeCollection(chain2)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(chain2)
        # Actually remove run2 and chain1, which should work now.
        registry.removeCollection(chain1)
        registry.removeCollection(run2)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(run2)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(chain1)
        # Remove tag1 as well, just to test that we can remove TAGGED
        # collections.
        registry.removeCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(tag1)

    def testCollectionChainFlatten(self):
        """Test that Registry.setCollectionChain obeys its 'flatten' option."""
        registry = self.makeRegistry()
        registry.registerCollection("inner", CollectionType.CHAINED)
        registry.registerCollection("innermost", CollectionType.RUN)
        registry.setCollectionChain("inner", ["innermost"])
        registry.registerCollection("outer", CollectionType.CHAINED)
        registry.setCollectionChain("outer", ["inner"], flatten=False)
        self.assertEqual(list(registry.getCollectionChain("outer")), ["inner"])
        registry.setCollectionChain("outer", ["inner"], flatten=True)
        self.assertEqual(list(registry.getCollectionChain("outer")), ["innermost"])

    def testBasicTransaction(self):
        """Test that all operations within a single transaction block are
        rolled back if an exception propagates out of the block.
        """
        registry = self.makeRegistry()
        storageClass = StorageClass("testDatasetType")
        registry.storageClasses.registerStorageClass(storageClass)
        with registry.transaction():
            registry.insertDimensionData("instrument", {"name": "Cam1", "class_name": "A"})
        with self.assertRaises(ValueError):
            with registry.transaction():
                registry.insertDimensionData("instrument", {"name": "Cam2"})
                raise ValueError("Oops, something went wrong")
        # Cam1 should exist.
        self.assertEqual(registry.expandDataId(instrument="Cam1").records["instrument"].class_name, "A")
        # But Cam2 and Cam3 should both not exist.
        with self.assertRaises(DataIdValueError):
            registry.expandDataId(instrument="Cam2")
        with self.assertRaises(DataIdValueError):
            registry.expandDataId(instrument="Cam3")

    def testNestedTransaction(self):
        """Test that operations within a transaction block are not rolled back
        if an exception propagates out of an inner transaction block and is
        then caught.
        """
        registry = self.makeRegistry()
        dimension = registry.dimensions["instrument"]
        dataId1 = {"instrument": "DummyCam"}
        dataId2 = {"instrument": "DummyCam2"}
        checkpointReached = False
        with registry.transaction():
            # This should be added and (ultimately) committed.
            registry.insertDimensionData(dimension, dataId1)
            with self.assertRaises(sqlalchemy.exc.IntegrityError):
                with registry.transaction(savepoint=True):
                    # This does not conflict, and should succeed (but not
                    # be committed).
                    registry.insertDimensionData(dimension, dataId2)
                    checkpointReached = True
                    # This should conflict and raise, triggering a rollback
                    # of the previous insertion within the same transaction
                    # context, but not the original insertion in the outer
                    # block.
                    registry.insertDimensionData(dimension, dataId1)
        self.assertTrue(checkpointReached)
        self.assertIsNotNone(registry.expandDataId(dataId1, graph=dimension.graph))
        with self.assertRaises(DataIdValueError):
            registry.expandDataId(dataId2, graph=dimension.graph)
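
    # For illustration only: the savepoint behavior above corresponds to
    # nested transactions in SQL, roughly:
    #
    #     BEGIN;
    #       INSERT ...;          -- outer work, ultimately committed
    #       SAVEPOINT sp1;
    #         INSERT ...;        -- inner work
    #       ROLLBACK TO sp1;     -- inner work undone when the error is caught
    #     COMMIT;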

    def testInstrumentDimensions(self):
        """Test queries involving only instrument dimensions, with no joins to
        skymap.
        """
        registry = self.makeRegistry()

        # Need a bunch of dimensions and datasets for the test.
        registry.insertDimensionData(
            "instrument", dict(name="DummyCam", visit_max=25, exposure_max=300, detector_max=6)
        )
        registry.insertDimensionData(
            "physical_filter",
            dict(instrument="DummyCam", name="dummy_r", band="r"),
            dict(instrument="DummyCam", name="dummy_i", band="i"),
        )
        registry.insertDimensionData(
            "detector", *[dict(instrument="DummyCam", id=i, full_name=str(i)) for i in range(1, 6)]
        )
        registry.insertDimensionData(
            "visit_system",
            dict(instrument="DummyCam", id=1, name="default"),
        )
        registry.insertDimensionData(
            "visit",
            dict(instrument="DummyCam", id=10, name="ten", physical_filter="dummy_i", visit_system=1),
            dict(instrument="DummyCam", id=11, name="eleven", physical_filter="dummy_r", visit_system=1),
            dict(instrument="DummyCam", id=20, name="twelve", physical_filter="dummy_r", visit_system=1),
        )
        for i in range(1, 6):
            registry.insertDimensionData(
                "visit_detector_region",
                dict(instrument="DummyCam", visit=10, detector=i),
                dict(instrument="DummyCam", visit=11, detector=i),
                dict(instrument="DummyCam", visit=20, detector=i),
            )
        registry.insertDimensionData(
            "exposure",
            dict(instrument="DummyCam", id=100, obs_id="100", physical_filter="dummy_i"),
            dict(instrument="DummyCam", id=101, obs_id="101", physical_filter="dummy_i"),
            dict(instrument="DummyCam", id=110, obs_id="110", physical_filter="dummy_r"),
            dict(instrument="DummyCam", id=111, obs_id="111", physical_filter="dummy_r"),
            dict(instrument="DummyCam", id=200, obs_id="200", physical_filter="dummy_r"),
            dict(instrument="DummyCam", id=201, obs_id="201", physical_filter="dummy_r"),
        )
        registry.insertDimensionData(
            "visit_definition",
            dict(instrument="DummyCam", exposure=100, visit_system=1, visit=10),
            dict(instrument="DummyCam", exposure=101, visit_system=1, visit=10),
            dict(instrument="DummyCam", exposure=110, visit_system=1, visit=11),
            dict(instrument="DummyCam", exposure=111, visit_system=1, visit=11),
            dict(instrument="DummyCam", exposure=200, visit_system=1, visit=20),
            dict(instrument="DummyCam", exposure=201, visit_system=1, visit=20),
        )
        # Dataset types.
        run1 = "test1_r"
        run2 = "test2_r"
        tagged2 = "test2_t"
        registry.registerRun(run1)
        registry.registerRun(run2)
        registry.registerCollection(tagged2)
        storageClass = StorageClass("testDataset")
        registry.storageClasses.registerStorageClass(storageClass)
        rawType = DatasetType(
            name="RAW",
            dimensions=registry.dimensions.extract(("instrument", "exposure", "detector")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(rawType)
        calexpType = DatasetType(
            name="CALEXP",
            dimensions=registry.dimensions.extract(("instrument", "visit", "detector")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(calexpType)

        # Add pre-existing datasets.
        for exposure in (100, 101, 110, 111):
            for detector in (1, 2, 3):
                # Note that only 3 of 5 detectors have datasets.
                dataId = dict(instrument="DummyCam", exposure=exposure, detector=detector)
                (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run1)
                # Exposures 100 and 101 appear in both run1 and tagged2:
                # 100 has different datasets in the different collections,
                # 101 has the same dataset in both collections.
                if exposure == 100:
                    (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run2)
                if exposure in (100, 101):
                    registry.associate(tagged2, [ref])
        # Add pre-existing datasets to tagged2.
        for exposure in (200, 201):
            for detector in (3, 4, 5):
                # Note that only 3 of 5 detectors have datasets.
                dataId = dict(instrument="DummyCam", exposure=exposure, detector=detector)
                (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run2)
                registry.associate(tagged2, [ref])

        dimensions = DimensionGraph(
            registry.dimensions, dimensions=(rawType.dimensions.required | calexpType.dimensions.required)
        )
        # Test that a single dim string works as well as a list of str.
        rows = registry.queryDataIds("visit", datasets=rawType, collections=run1).expanded().toSet()
        rowsI = registry.queryDataIds(["visit"], datasets=rawType, collections=run1).expanded().toSet()
        self.assertEqual(rows, rowsI)
        # With an empty expression.
        rows = registry.queryDataIds(dimensions, datasets=rawType, collections=run1).expanded().toSet()
        self.assertEqual(len(rows), 4 * 3)  # 4 exposures times 3 detectors
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101, 110, 111))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10, 11))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3))

        # Second collection.
        rows = registry.queryDataIds(dimensions, datasets=rawType, collections=tagged2).toSet()
        self.assertEqual(len(rows), 4 * 3)  # 4 exposures times 3 detectors
        for dataId in rows:
            self.assertCountEqual(dataId.keys(), ("instrument", "detector", "exposure", "visit"))
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101, 200, 201))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10, 20))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3, 4, 5))

        # With two input collections.
        rows = registry.queryDataIds(dimensions, datasets=rawType, collections=[run1, tagged2]).toSet()
        self.assertEqual(len(set(rows)), 6 * 3)  # 6 exposures times 3 detectors; set needed to de-dupe
        for dataId in rows:
            self.assertCountEqual(dataId.keys(), ("instrument", "detector", "exposure", "visit"))
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101, 110, 111, 200, 201))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10, 11, 20))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3, 4, 5))

        # Limit to a single visit.
        rows = registry.queryDataIds(
            dimensions, datasets=rawType, collections=run1, where="visit = 10", instrument="DummyCam"
        ).toSet()
        self.assertEqual(len(rows), 2 * 3)  # 2 exposures times 3 detectors
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10,))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3))

        # A more limiting expression, using link names instead of
        # Table.column.
        rows = registry.queryDataIds(
            dimensions,
            datasets=rawType,
            collections=run1,
            where="visit = 10 and detector > 1 and 'DummyCam'=instrument",
        ).toSet()
        self.assertEqual(len(rows), 2 * 2)  # 2 exposures times 2 detectors
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10,))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (2, 3))

        # queryDataIds with only one of `datasets` and `collections` is an
        # error.
        with self.assertRaises(CollectionError):
            registry.queryDataIds(dimensions, datasets=rawType)
        with self.assertRaises(ArgumentError):
            registry.queryDataIds(dimensions, collections=run1)

        # An expression that excludes everything.
        rows = registry.queryDataIds(
            dimensions, datasets=rawType, collections=run1, where="visit > 1000", instrument="DummyCam"
        ).toSet()
        self.assertEqual(len(rows), 0)

        # Selecting by physical_filter, which is not in the dimensions, but
        # is a part of the full expression, so it should work too.
        rows = registry.queryDataIds(
            dimensions,
            datasets=rawType,
            collections=run1,
            where="physical_filter = 'dummy_r'",
            instrument="DummyCam",
        ).toSet()
        self.assertEqual(len(rows), 2 * 3)  # 2 exposures times 3 detectors
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (110, 111))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (11,))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3))

    def testSkyMapDimensions(self):
        """Tests involving only skymap dimensions, no joins to instrument."""
        registry = self.makeRegistry()

        # Need a bunch of dimensions and datasets for the test; we want
        # "band" in the test, so we also have to add physical_filter
        # dimensions.
        registry.insertDimensionData("instrument", dict(instrument="DummyCam"))
        registry.insertDimensionData(
            "physical_filter",
            dict(instrument="DummyCam", name="dummy_r", band="r"),
            dict(instrument="DummyCam", name="dummy_i", band="i"),
        )
        registry.insertDimensionData("skymap", dict(name="DummyMap", hash=b"sha!"))
        for tract in range(10):
            registry.insertDimensionData("tract", dict(skymap="DummyMap", id=tract))
            registry.insertDimensionData(
                "patch",
                *[dict(skymap="DummyMap", tract=tract, id=patch, cell_x=0, cell_y=0) for patch in range(10)],
            )

        # Dataset types.
        run = "tésτ"
        registry.registerRun(run)
        storageClass = StorageClass("testDataset")
        registry.storageClasses.registerStorageClass(storageClass)
        calexpType = DatasetType(
            name="deepCoadd_calexp",
            dimensions=registry.dimensions.extract(("skymap", "tract", "patch", "band")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(calexpType)
        mergeType = DatasetType(
            name="deepCoadd_mergeDet",
            dimensions=registry.dimensions.extract(("skymap", "tract", "patch")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(mergeType)
        measType = DatasetType(
            name="deepCoadd_meas",
            dimensions=registry.dimensions.extract(("skymap", "tract", "patch", "band")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(measType)

        dimensions = DimensionGraph(
            registry.dimensions,
            dimensions=(
                calexpType.dimensions.required | mergeType.dimensions.required | measType.dimensions.required
            ),
        )

        # Add pre-existing datasets.
        for tract in (1, 3, 5):
            for patch in (2, 4, 6, 7):
                dataId = dict(skymap="DummyMap", tract=tract, patch=patch)
                registry.insertDatasets(mergeType, dataIds=[dataId], run=run)
                for aFilter in ("i", "r"):
                    dataId = dict(skymap="DummyMap", tract=tract, patch=patch, band=aFilter)
                    registry.insertDatasets(calexpType, dataIds=[dataId], run=run)

        # With an empty expression.
        rows = registry.queryDataIds(dimensions, datasets=[calexpType, mergeType], collections=run).toSet()
        self.assertEqual(len(rows), 3 * 4 * 2)  # 3 tracts x 4 patches x 2 filters
        for dataId in rows:
            self.assertCountEqual(dataId.keys(), ("skymap", "tract", "patch", "band"))
        self.assertCountEqual({dataId["tract"] for dataId in rows}, (1, 3, 5))
        self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 4, 6, 7))
        self.assertCountEqual({dataId["band"] for dataId in rows}, ("i", "r"))

        # Limit to 2 tracts and 2 patches.
        rows = registry.queryDataIds(
            dimensions,
            datasets=[calexpType, mergeType],
            collections=run,
            where="tract IN (1, 5) AND patch IN (2, 7)",
            skymap="DummyMap",
        ).toSet()
        self.assertEqual(len(rows), 2 * 2 * 2)  # 2 tracts x 2 patches x 2 filters
        self.assertCountEqual({dataId["tract"] for dataId in rows}, (1, 5))
        self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 7))
        self.assertCountEqual({dataId["band"] for dataId in rows}, ("i", "r"))

        # Limit to a single filter.
        rows = registry.queryDataIds(
            dimensions, datasets=[calexpType, mergeType], collections=run, where="band = 'i'"
        ).toSet()
        self.assertEqual(len(rows), 3 * 4 * 1)  # 3 tracts x 4 patches x 1 filter
        self.assertCountEqual({dataId["tract"] for dataId in rows}, (1, 3, 5))
        self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 4, 6, 7))
        self.assertCountEqual({dataId["band"] for dataId in rows}, ("i",))

        # Specifying a non-existing skymap is an exception.
        with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
            rows = registry.queryDataIds(
                dimensions, datasets=[calexpType, mergeType], collections=run, where="skymap = 'Mars'"
            ).toSet()
1236 def testSpatialJoin(self):
1237 """Test queries that involve spatial overlap joins."""
1238 registry = self.makeRegistry()
1239 self.loadData(registry, "hsc-rc2-subset.yaml")
1241 # Dictionary of spatial DatabaseDimensionElements, keyed by the name of
1242 # the TopologicalFamily they belong to. We'll relate all elements in
1243 # each family to all of the elements in each other family.
1244 families = defaultdict(set)
1245 # Dictionary of {element.name: {dataId: region}}.
1246 regions = {}
1247 for element in registry.dimensions.getDatabaseElements():
1248 if element.spatial is not None:
1249 families[element.spatial.name].add(element)
1250 regions[element.name] = {
1251 record.dataId: record.region for record in registry.queryDimensionRecords(element)
1252 }
1254 # If this check fails, it's not necessarily a problem - it may just be
1255 # a reasonable change to the default dimension definitions - but the
1256 # test below depends on there being more than one family to do anything
1257 # useful.
1258 self.assertEqual(len(families), 2)
1260 # Overlap DatabaseDimensionElements with each other.
1261 for family1, family2 in itertools.combinations(families, 2):
1262 for element1, element2 in itertools.product(families[family1], families[family2]):
1263 graph = DimensionGraph.union(element1.graph, element2.graph)
1264 # Construct expected set of overlapping data IDs via a
1265 # brute-force comparison of the regions we've already fetched.
1266 expected = {
1267 DataCoordinate.standardize({**dataId1.byName(), **dataId2.byName()}, graph=graph)
1268 for (dataId1, region1), (dataId2, region2) in itertools.product(
1269 regions[element1.name].items(), regions[element2.name].items()
1270 )
1271 if not region1.isDisjointFrom(region2)
1272 }
1273 self.assertGreater(len(expected), 2, msg="Test that we aren't just comparing empty sets.")
1274 queried = set(registry.queryDataIds(graph))
1275 self.assertEqual(expected, queried)
1277 # Overlap each DatabaseDimensionElement with the commonSkyPix system.
1278 commonSkyPix = registry.dimensions.commonSkyPix
1279 for elementName, these_regions in regions.items():
1280 graph = DimensionGraph.union(registry.dimensions[elementName].graph, commonSkyPix.graph)
1281 expected = set()
1282 for dataId, region in these_regions.items():
1283 for begin, end in commonSkyPix.pixelization.envelope(region):
1284 expected.update(
1285 DataCoordinate.standardize({commonSkyPix.name: index, **dataId.byName()}, graph=graph)
1286 for index in range(begin, end)
1287 )
1288 self.assertGreater(len(expected), 2, msg="Test that we aren't just comparing empty sets.")
1289 queried = set(registry.queryDataIds(graph))
1290 self.assertEqual(expected, queried)
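# A compact sketch of the two overlap idioms exercised above, assuming
# lsst.sphgeom regions `r1` and `r2` and a pixelization `pix` such as
# commonSkyPix.pixelization:
#
#     overlaps = not r1.isDisjointFrom(r2)  # brute-force region comparison
#     # envelope() returns a RangeSet of pixel indices whose pixels may
#     # overlap r1; iterating it yields half-open (begin, end) ranges.
#     indices = [i for begin, end in pix.envelope(r1) for i in range(begin, end)]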
1292 def testAbstractQuery(self):
1293 """Test that we can run a query that just lists the known
1294 bands. This is tricky because band is
1295 backed by a query against physical_filter.
1296 """
1297 registry = self.makeRegistry()
1298 registry.insertDimensionData("instrument", dict(name="DummyCam"))
1299 registry.insertDimensionData(
1300 "physical_filter",
1301 dict(instrument="DummyCam", name="dummy_i", band="i"),
1302 dict(instrument="DummyCam", name="dummy_i2", band="i"),
1303 dict(instrument="DummyCam", name="dummy_r", band="r"),
1304 )
1305 rows = registry.queryDataIds(["band"]).toSet()
1306 self.assertCountEqual(
1307 rows,
1308 [
1309 DataCoordinate.standardize(band="i", universe=registry.dimensions),
1310 DataCoordinate.standardize(band="r", universe=registry.dimensions),
1311 ],
1312 )
1314 def testAttributeManager(self):
1315 """Test basic functionality of attribute manager."""
1316 # Number of attributes with schema versions in a fresh database:
1317 # 6 managers with 2 records per manager, plus config for dimensions.
1318 VERSION_COUNT = 6 * 2 + 1
1320 registry = self.makeRegistry()
1321 attributes = registry._managers.attributes
1323 # check what get() returns for non-existing key
1324 self.assertIsNone(attributes.get("attr"))
1325 self.assertEqual(attributes.get("attr", ""), "")
1326 self.assertEqual(attributes.get("attr", "Value"), "Value")
1327 self.assertEqual(len(list(attributes.items())), VERSION_COUNT)
1329 # cannot store empty key or value
1330 with self.assertRaises(ValueError):
1331 attributes.set("", "value")
1332 with self.assertRaises(ValueError):
1333 attributes.set("attr", "")
1335 # set value of non-existing key
1336 attributes.set("attr", "value")
1337 self.assertEqual(len(list(attributes.items())), VERSION_COUNT + 1)
1338 self.assertEqual(attributes.get("attr"), "value")
1340 # update value of existing key
1341 with self.assertRaises(ButlerAttributeExistsError):
1342 attributes.set("attr", "value2")
1344 attributes.set("attr", "value2", force=True)
1345 self.assertEqual(len(list(attributes.items())), VERSION_COUNT + 1)
1346 self.assertEqual(attributes.get("attr"), "value2")
1348 # delete existing key
1349 self.assertTrue(attributes.delete("attr"))
1350 self.assertEqual(len(list(attributes.items())), VERSION_COUNT)
1352 # delete non-existing key
1353 self.assertFalse(attributes.delete("non-attr"))
1355 # store a bunch of keys and get the list back
1356 data = [
1357 ("version.core", "1.2.3"),
1358 ("version.dimensions", "3.2.1"),
1359 ("config.managers.opaque", "ByNameOpaqueTableStorageManager"),
1360 ]
1361 for key, value in data:
1362 attributes.set(key, value)
1363 items = dict(attributes.items())
1364 for key, value in data:
1365 self.assertEqual(items[key], value)
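# The attribute-manager contract exercised above, in brief; `attributes`
# comes from `registry._managers.attributes` as in this test:
#
#     attributes.set("key", "v1")              # raises ButlerAttributeExistsError if "key" exists
#     attributes.set("key", "v2", force=True)  # overwrites an existing key
#     value = attributes.get("key", "default") # returns the default if absent
#     attributes.delete("key")                 # returns False if the key is absent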
1367 def testQueryDatasetsDeduplication(self):
1368 """Test that the findFirst option to queryDatasets selects datasets
1369 from collections in the order given.
1370 """
1371 registry = self.makeRegistry()
1372 self.loadData(registry, "base.yaml")
1373 self.loadData(registry, "datasets.yaml")
1374 self.assertCountEqual(
1375 list(registry.queryDatasets("bias", collections=["imported_g", "imported_r"])),
1376 [
1377 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1378 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g"),
1379 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g"),
1380 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r"),
1381 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r"),
1382 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1383 ],
1384 )
1385 self.assertCountEqual(
1386 list(registry.queryDatasets("bias", collections=["imported_g", "imported_r"], findFirst=True)),
1387 [
1388 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1389 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g"),
1390 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g"),
1391 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1392 ],
1393 )
1394 self.assertCountEqual(
1395 list(registry.queryDatasets("bias", collections=["imported_r", "imported_g"], findFirst=True)),
1396 [
1397 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1398 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r"),
1399 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r"),
1400 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1401 ],
1402 )
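# In short: with findFirst=True each data ID resolves to the dataset from
# the first collection in the search order that contains one, so reversing
# the collection list can change the winners (sketch, same registry):
#
#     registry.queryDatasets("bias", collections=["imported_r", "imported_g"], findFirst=True)
#     # detector=1 still comes from imported_g (it is absent from
#     # imported_r); detectors 2-4 now come from imported_r.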
1404 def testQueryResults(self):
1405 """Test querying for data IDs and then manipulating the QueryResults
1406 object returned to perform other queries.
1407 """
1408 registry = self.makeRegistry()
1409 self.loadData(registry, "base.yaml")
1410 self.loadData(registry, "datasets.yaml")
1411 bias = registry.getDatasetType("bias")
1412 flat = registry.getDatasetType("flat")
1413 # Obtain expected results from methods other than those we're testing
1414 # here. That includes:
1415 # - the dimensions of the data IDs we want to query:
1416 expectedGraph = DimensionGraph(registry.dimensions, names=["detector", "physical_filter"])
1417 # - the dimensions of some other data IDs we'll extract from that:
1418 expectedSubsetGraph = DimensionGraph(registry.dimensions, names=["detector"])
1419 # - the data IDs we expect to obtain from the first queries:
1420 expectedDataIds = DataCoordinateSet(
1421 {
1422 DataCoordinate.standardize(
1423 instrument="Cam1", detector=d, physical_filter=p, universe=registry.dimensions
1424 )
1425 for d, p in itertools.product({1, 2, 3}, {"Cam1-G", "Cam1-R1", "Cam1-R2"})
1426 },
1427 graph=expectedGraph,
1428 hasFull=False,
1429 hasRecords=False,
1430 )
1431 # - the flat datasets we expect to find from those data IDs, in just
1432 # one collection (so deduplication is irrelevant):
1433 expectedFlats = [
1434 registry.findDataset(
1435 flat, instrument="Cam1", detector=1, physical_filter="Cam1-R1", collections="imported_r"
1436 ),
1437 registry.findDataset(
1438 flat, instrument="Cam1", detector=2, physical_filter="Cam1-R1", collections="imported_r"
1439 ),
1440 registry.findDataset(
1441 flat, instrument="Cam1", detector=3, physical_filter="Cam1-R2", collections="imported_r"
1442 ),
1443 ]
1444 # - the data IDs we expect to extract from that:
1445 expectedSubsetDataIds = expectedDataIds.subset(expectedSubsetGraph)
1446 # - the bias datasets we expect to find from those data IDs, after we
1447 # subset out the physical_filter dimension, first with duplicates:
1448 expectedAllBiases = [
1449 registry.findDataset(bias, instrument="Cam1", detector=1, collections="imported_g"),
1450 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_g"),
1451 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_g"),
1452 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_r"),
1453 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_r"),
1454 ]
1455 # - ...and without duplicates:
1456 expectedDeduplicatedBiases = [
1457 registry.findDataset(bias, instrument="Cam1", detector=1, collections="imported_g"),
1458 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_r"),
1459 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_r"),
1460 ]
1461 # Test against those expected results, using a "lazy" query for the
1462 # data IDs (which re-executes that query each time we use it to do
1463 # something new).
1464 dataIds = registry.queryDataIds(
1465 ["detector", "physical_filter"],
1466 where="detector.purpose = 'SCIENCE'", # this rejects detector=4
1467 instrument="Cam1",
1468 )
1469 self.assertEqual(dataIds.graph, expectedGraph)
1470 self.assertEqual(dataIds.toSet(), expectedDataIds)
1471 self.assertCountEqual(
1472 list(
1473 dataIds.findDatasets(
1474 flat,
1475 collections=["imported_r"],
1476 )
1477 ),
1478 expectedFlats,
1479 )
1480 subsetDataIds = dataIds.subset(expectedSubsetGraph, unique=True)
1481 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1482 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1483 self.assertCountEqual(
1484 list(subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=False)),
1485 expectedAllBiases,
1486 )
1487 self.assertCountEqual(
1488 list(subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)),
1489 expectedDeduplicatedBiases,
1490 )
1492 # Check that mismatched dataset type dimensions raise ValueError.
1493 with self.assertRaises(ValueError):
1494 subsetDataIds.findDatasets("flat", collections=["imported_r", "imported_g"], findFirst=True)
1496 # Use a component dataset type.
1497 self.assertCountEqual(
1498 [
1499 ref.makeComponentRef("image")
1500 for ref in subsetDataIds.findDatasets(
1501 bias,
1502 collections=["imported_r", "imported_g"],
1503 findFirst=False,
1504 )
1505 ],
1506 [ref.makeComponentRef("image") for ref in expectedAllBiases],
1507 )
1509 # Use a named dataset type that does not exist and a dataset type
1510 # object that does not exist.
1511 unknown_type = DatasetType("not_known", dimensions=bias.dimensions, storageClass="Exposure")
1513 # Test both string name and dataset type object.
1514 test_type: str | DatasetType
1515 for test_type, test_type_name in (
1516 (unknown_type, unknown_type.name),
1517 (unknown_type.name, unknown_type.name),
1518 ):
1519 with self.assertRaisesRegex(DatasetTypeError, expected_regex=test_type_name):
1520 list(
1521 subsetDataIds.findDatasets(
1522 test_type, collections=["imported_r", "imported_g"], findFirst=True
1523 )
1524 )
1526 # Materialize the bias dataset queries (only) by putting the results
1527 # into temporary tables, then repeat those tests.
1528 with subsetDataIds.findDatasets(
1529 bias, collections=["imported_r", "imported_g"], findFirst=False
1530 ).materialize() as biases:
1531 self.assertCountEqual(list(biases), expectedAllBiases)
1532 with subsetDataIds.findDatasets(
1533 bias, collections=["imported_r", "imported_g"], findFirst=True
1534 ).materialize() as biases:
1535 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1536 # Materialize the data ID subset query, but not the dataset queries.
1537 with subsetDataIds.materialize() as subsetDataIds:
1538 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1539 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1540 self.assertCountEqual(
1541 list(
1542 subsetDataIds.findDatasets(
1543 bias, collections=["imported_r", "imported_g"], findFirst=False
1544 )
1545 ),
1546 expectedAllBiases,
1547 )
1548 self.assertCountEqual(
1549 list(
1550 subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)
1551 ),
1552 expectedDeduplicatedBiases,
1553 )
1554 # Materialize the dataset queries, too.
1555 with subsetDataIds.findDatasets(
1556 bias, collections=["imported_r", "imported_g"], findFirst=False
1557 ).materialize() as biases:
1558 self.assertCountEqual(list(biases), expectedAllBiases)
1559 with subsetDataIds.findDatasets(
1560 bias, collections=["imported_r", "imported_g"], findFirst=True
1561 ).materialize() as biases:
1562 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1563 # Materialize the original query, but none of the follow-up queries.
1564 with dataIds.materialize() as dataIds:
1565 self.assertEqual(dataIds.graph, expectedGraph)
1566 self.assertEqual(dataIds.toSet(), expectedDataIds)
1567 self.assertCountEqual(
1568 list(
1569 dataIds.findDatasets(
1570 flat,
1571 collections=["imported_r"],
1572 )
1573 ),
1574 expectedFlats,
1575 )
1576 subsetDataIds = dataIds.subset(expectedSubsetGraph, unique=True)
1577 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1578 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1579 self.assertCountEqual(
1580 list(
1581 subsetDataIds.findDatasets(
1582 bias, collections=["imported_r", "imported_g"], findFirst=False
1583 )
1584 ),
1585 expectedAllBiases,
1586 )
1587 self.assertCountEqual(
1588 list(
1589 subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)
1590 ),
1591 expectedDeduplicatedBiases,
1592 )
1593 # Materialize just the bias dataset queries.
1594 with subsetDataIds.findDatasets(
1595 bias, collections=["imported_r", "imported_g"], findFirst=False
1596 ).materialize() as biases:
1597 self.assertCountEqual(list(biases), expectedAllBiases)
1598 with subsetDataIds.findDatasets(
1599 bias, collections=["imported_r", "imported_g"], findFirst=True
1600 ).materialize() as biases:
1601 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1602 # Materialize the subset data ID query, but not the dataset
1603 # queries.
1604 with subsetDataIds.materialize() as subsetDataIds:
1605 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1606 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1607 self.assertCountEqual(
1608 list(
1609 subsetDataIds.findDatasets(
1610 bias, collections=["imported_r", "imported_g"], findFirst=False
1611 )
1612 ),
1613 expectedAllBiases,
1614 )
1615 self.assertCountEqual(
1616 list(
1617 subsetDataIds.findDatasets(
1618 bias, collections=["imported_r", "imported_g"], findFirst=True
1619 )
1620 ),
1621 expectedDeduplicatedBiases,
1622 )
1623 # Materialize the bias dataset queries, too, so now we're
1624 # materializing every single step.
1625 with subsetDataIds.findDatasets(
1626 bias, collections=["imported_r", "imported_g"], findFirst=False
1627 ).materialize() as biases:
1628 self.assertCountEqual(list(biases), expectedAllBiases)
1629 with subsetDataIds.findDatasets(
1630 bias, collections=["imported_r", "imported_g"], findFirst=True
1631 ).materialize() as biases:
1632 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
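# The materialize() idiom used throughout this test, reduced to its core:
# materialized results live in a temporary table for the duration of the
# context (sketch, assuming `registry` and `bias` as above):
#
#     with registry.queryDataIds(["detector"], instrument="Cam1").materialize() as data_ids:
#         refs = list(data_ids.findDatasets(bias, collections=["imported_r"], findFirst=True))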
1634 def testStorageClassPropagation(self):
1635 """Test that queries for datasets respect the storage class passed in
1636 as part of a full dataset type.
1637 """
1638 registry = self.makeRegistry()
1639 self.loadData(registry, "base.yaml")
1640 dataset_type_in_registry = DatasetType(
1641 "tbl", dimensions=["instrument"], storageClass="Packages", universe=registry.dimensions
1642 )
1643 registry.registerDatasetType(dataset_type_in_registry)
1644 run = "run1"
1645 registry.registerRun(run)
1646 (inserted_ref,) = registry.insertDatasets(
1647 dataset_type_in_registry, [registry.expandDataId(instrument="Cam1")], run=run
1648 )
1649 self.assertEqual(inserted_ref.datasetType, dataset_type_in_registry)
1650 query_dataset_type = DatasetType(
1651 "tbl", dimensions=["instrument"], storageClass="StructuredDataDict", universe=registry.dimensions
1652 )
1653 self.assertNotEqual(dataset_type_in_registry, query_dataset_type)
1654 query_datasets_result = registry.queryDatasets(query_dataset_type, collections=[run])
1655 self.assertEqual(query_datasets_result.parentDatasetType, query_dataset_type) # type: ignore
1656 (query_datasets_ref,) = query_datasets_result
1657 self.assertEqual(query_datasets_ref.datasetType, query_dataset_type)
1658 query_data_ids_find_datasets_result = registry.queryDataIds(["instrument"]).findDatasets(
1659 query_dataset_type, collections=[run]
1660 )
1661 self.assertEqual(query_data_ids_find_datasets_result.parentDatasetType, query_dataset_type)
1662 (query_data_ids_find_datasets_ref,) = query_data_ids_find_datasets_result
1663 self.assertEqual(query_data_ids_find_datasets_ref.datasetType, query_dataset_type)
1664 query_dataset_types_result = registry.queryDatasetTypes(query_dataset_type)
1665 self.assertEqual(list(query_dataset_types_result), [query_dataset_type])
1666 find_dataset_ref = registry.findDataset(query_dataset_type, instrument="Cam1", collections=[run])
1667 self.assertEqual(find_dataset_ref.datasetType, query_dataset_type)
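# The propagation rule verified above, in brief (sketch): querying with a
# full DatasetType that overrides the storage class yields refs carrying
# the override rather than the registered storage class:
#
#     query_type = DatasetType("tbl", dimensions=["instrument"],
#                              storageClass="StructuredDataDict", universe=registry.dimensions)
#     (ref,) = registry.queryDatasets(query_type, collections=[run])
#     assert ref.datasetType == query_type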
1669 def testEmptyDimensionsQueries(self):
1670 """Test Query and QueryResults objects in the case where there are no
1671 dimensions.
1672 """
1673 # Set up test data: one dataset type, two runs, one dataset in each.
1674 registry = self.makeRegistry()
1675 self.loadData(registry, "base.yaml")
1676 schema = DatasetType("schema", dimensions=registry.dimensions.empty, storageClass="Catalog")
1677 registry.registerDatasetType(schema)
1678 dataId = DataCoordinate.makeEmpty(registry.dimensions)
1679 run1 = "run1"
1680 run2 = "run2"
1681 registry.registerRun(run1)
1682 registry.registerRun(run2)
1683 (dataset1,) = registry.insertDatasets(schema, dataIds=[dataId], run=run1)
1684 (dataset2,) = registry.insertDatasets(schema, dataIds=[dataId], run=run2)
1685 # Query directly for both of the datasets, then for each one individually.
1686 self.checkQueryResults(
1687 registry.queryDatasets(schema, collections=[run1, run2], findFirst=False), [dataset1, dataset2]
1688 )
1689 self.checkQueryResults(
1690 registry.queryDatasets(schema, collections=[run1, run2], findFirst=True),
1691 [dataset1],
1692 )
1693 self.checkQueryResults(
1694 registry.queryDatasets(schema, collections=[run2, run1], findFirst=True),
1695 [dataset2],
1696 )
1697 # Query for data IDs with no dimensions.
1698 dataIds = registry.queryDataIds([])
1699 self.checkQueryResults(dataIds, [dataId])
1700 # Use queried data IDs to find the datasets.
1701 self.checkQueryResults(
1702 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1703 [dataset1, dataset2],
1704 )
1705 self.checkQueryResults(
1706 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1707 [dataset1],
1708 )
1709 self.checkQueryResults(
1710 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1711 [dataset2],
1712 )
1713 # Now materialize the data ID query results and repeat those tests.
1714 with dataIds.materialize() as dataIds:
1715 self.checkQueryResults(dataIds, [dataId])
1716 self.checkQueryResults(
1717 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1718 [dataset1],
1719 )
1720 self.checkQueryResults(
1721 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1722 [dataset2],
1723 )
1724 # Query for non-empty data IDs, then subset that to get the empty one.
1725 # Repeat the above tests starting from that.
1726 dataIds = registry.queryDataIds(["instrument"]).subset(registry.dimensions.empty, unique=True)
1727 self.checkQueryResults(dataIds, [dataId])
1728 self.checkQueryResults(
1729 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1730 [dataset1, dataset2],
1731 )
1732 self.checkQueryResults(
1733 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1734 [dataset1],
1735 )
1736 self.checkQueryResults(
1737 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1738 [dataset2],
1739 )
1740 with dataIds.materialize() as dataIds:
1741 self.checkQueryResults(dataIds, [dataId])
1742 self.checkQueryResults(
1743 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1744 [dataset1, dataset2],
1745 )
1746 self.checkQueryResults(
1747 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1748 [dataset1],
1749 )
1750 self.checkQueryResults(
1751 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1752 [dataset2],
1753 )
1754 # Query for non-empty data IDs, then materialize, then subset to get
1755 # the empty one. Repeat again.
1756 with registry.queryDataIds(["instrument"]).materialize() as nonEmptyDataIds:
1757 dataIds = nonEmptyDataIds.subset(registry.dimensions.empty, unique=True)
1758 self.checkQueryResults(dataIds, [dataId])
1759 self.checkQueryResults(
1760 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1761 [dataset1, dataset2],
1762 )
1763 self.checkQueryResults(
1764 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1765 [dataset1],
1766 )
1767 self.checkQueryResults(
1768 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1769 [dataset2],
1770 )
1771 with dataIds.materialize() as dataIds:
1772 self.checkQueryResults(dataIds, [dataId])
1773 self.checkQueryResults(
1774 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1775 [dataset1, dataset2],
1776 )
1777 self.checkQueryResults(
1778 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1779 [dataset1],
1780 )
1781 self.checkQueryResults(
1782 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1783 [dataset2],
1784 )
1785 # Query for non-empty data IDs with a constraint on an empty-data-ID
1786 # dataset that exists.
1787 dataIds = registry.queryDataIds(["instrument"], datasets="schema", collections=...)
1788 self.checkQueryResults(
1789 dataIds.subset(unique=True),
1790 [DataCoordinate.standardize(instrument="Cam1", universe=registry.dimensions)],
1791 )
1792 # Again query for non-empty data IDs with a constraint on empty-data-ID
1793 # datasets, but when the datasets don't exist. We delete the existing
1794 # dataset and query just that collection rather than creating a new
1795 # empty collection because this is a bit less likely for our build-time
1796 # logic to short-circuit (via the collection summaries), and such a
1797 # shortcut would make this test a bit more trivial than we'd like.
1798 registry.removeDatasets([dataset2])
1799 dataIds = registry.queryDataIds(["instrument"], datasets="schema", collections=run2)
1800 self.checkQueryResults(dataIds, [])
1802 def testDimensionDataModifications(self):
1803 """Test that modifying dimension records via:
1804 syncDimensionData(..., update=True) and
1805 insertDimensionData(..., replace=True) works as expected, even in the
1806 presence of datasets using those dimensions and spatial overlap
1807 relationships.
1808 """
1810 def unpack_range_set(ranges: lsst.sphgeom.RangeSet) -> Iterator[int]:
1811 """Unpack a sphgeom.RangeSet into the integers it contains."""
1812 for begin, end in ranges:
1813 yield from range(begin, end)
1815 def range_set_hull(
1816 ranges: lsst.sphgeom.RangeSet,
1817 pixelization: lsst.sphgeom.HtmPixelization,
1818 ) -> lsst.sphgeom.ConvexPolygon:
1819 """Create a ConvexPolygon hull of the region defined by a set of
1820 HTM pixelization index ranges.
1821 """
1822 points = []
1823 for index in unpack_range_set(ranges):
1824 points.extend(pixelization.triangle(index).getVertices())
1825 return lsst.sphgeom.ConvexPolygon(points)
1827 # Use HTM to set up an initial parent region (one arbitrary trixel)
1828 # and four child regions (the trixels within the parent at the next
1829 # level). We'll use the parent as a tract/visit region and the children
1830 # as its patch/visit_detector regions.
1831 registry = self.makeRegistry()
1832 htm6 = registry.dimensions.skypix["htm"][6].pixelization
1833 commonSkyPix = registry.dimensions.commonSkyPix.pixelization
1834 index = 12288
1835 child_ranges_small = lsst.sphgeom.RangeSet(index).scaled(4)
1836 assert htm6.universe().contains(child_ranges_small)
1837 child_regions_small = [htm6.triangle(i) for i in unpack_range_set(child_ranges_small)]
1838 parent_region_small = lsst.sphgeom.ConvexPolygon(
1839 list(itertools.chain.from_iterable(c.getVertices() for c in child_regions_small))
1840 )
1841 assert all(parent_region_small.contains(c) for c in child_regions_small)
1842 # Make a larger version of each child region: the convex hull of the
1843 # htm6 trixels that overlap the original's bounding circle. Make a new
1844 # parent that's the convex hull of the new children.
1845 child_regions_large = [
1846 range_set_hull(htm6.envelope(c.getBoundingCircle()), htm6) for c in child_regions_small
1847 ]
1848 assert all(
1849 large.contains(small)
1850 for large, small in zip(child_regions_large, child_regions_small, strict=True)
1851 )
1852 parent_region_large = lsst.sphgeom.ConvexPolygon(
1853 list(itertools.chain.from_iterable(c.getVertices() for c in child_regions_large))
1854 )
1855 assert all(parent_region_large.contains(c) for c in child_regions_large)
1856 assert parent_region_large.contains(parent_region_small)
1857 assert not parent_region_small.contains(parent_region_large)
1858 assert not all(parent_region_small.contains(c) for c in child_regions_large)
1859 # Find some commonSkyPix indices that overlap the large regions but do
1860 # not overlap the small regions. We use commonSkyPix here to make sure the
1861 # real tests later involve what's in the database, not just post-query
1862 # filtering of regions.
1863 child_difference_indices = []
1864 for large, small in zip(child_regions_large, child_regions_small, strict=True):
1865 difference = list(unpack_range_set(commonSkyPix.envelope(large) - commonSkyPix.envelope(small)))
1866 assert difference, "if this is empty, we can't test anything useful with these regions"
1867 assert all(
1868 not commonSkyPix.triangle(d).isDisjointFrom(large)
1869 and commonSkyPix.triangle(d).isDisjointFrom(small)
1870 for d in difference
1871 )
1872 child_difference_indices.append(difference)
1873 parent_difference_indices = list(
1874 unpack_range_set(
1875 commonSkyPix.envelope(parent_region_large) - commonSkyPix.envelope(parent_region_small)
1876 )
1877 )
1878 assert parent_difference_indices, "if this is empty, we can't test anything useful with these regions"
1879 assert all(
1880 (
1881 not commonSkyPix.triangle(d).isDisjointFrom(parent_region_large)
1882 and commonSkyPix.triangle(d).isDisjointFrom(parent_region_small)
1883 )
1884 for d in parent_difference_indices
1885 )
1886 # Now that we've finally got those regions, we'll insert the large ones
1887 # as tract/patch dimension records.
1888 skymap_name = "testing_v1"
1889 registry.insertDimensionData(
1890 "skymap",
1891 {
1892 "name": skymap_name,
1893 "hash": bytes([42]),
1894 "tract_max": 1,
1895 "patch_nx_max": 2,
1896 "patch_ny_max": 2,
1897 },
1898 )
1899 registry.insertDimensionData("tract", {"skymap": skymap_name, "id": 0, "region": parent_region_large})
1900 registry.insertDimensionData(
1901 "patch",
1902 *[
1903 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1904 for n, c in enumerate(child_regions_large)
1905 ],
1906 )
1907 # Add a dataset that uses these dimensions to make sure that modifying
1908 # them doesn't disrupt foreign keys (need to make sure DB doesn't
1909 # implement insert with replace=True as delete-then-insert).
1910 dataset_type = DatasetType(
1911 "coadd",
1912 dimensions=["tract", "patch"],
1913 universe=registry.dimensions,
1914 storageClass="Exposure",
1915 )
1916 registry.registerDatasetType(dataset_type)
1917 registry.registerCollection("the_run", CollectionType.RUN)
1918 registry.insertDatasets(
1919 dataset_type,
1920 [{"skymap": skymap_name, "tract": 0, "patch": 2}],
1921 run="the_run",
1922 )
1923 # Query for tracts and patches that overlap some "difference"
1924 # commonSkyPix pixels; there should be overlaps, because the database has
1925 # the "large" suite of regions.
1926 self.assertEqual(
1927 {0},
1928 {
1929 data_id["tract"]
1930 for data_id in registry.queryDataIds(
1931 ["tract"],
1932 skymap=skymap_name,
1933 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
1934 )
1935 },
1936 )
1937 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
1938 self.assertIn(
1939 patch_id,
1940 {
1941 data_id["patch"]
1942 for data_id in registry.queryDataIds(
1943 ["patch"],
1944 skymap=skymap_name,
1945 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
1946 )
1947 },
1948 )
1949 # Use sync to update the tract region and insert to update the regions
1950 # of the patches, to the "small" suite.
1951 updated = registry.syncDimensionData(
1952 "tract",
1953 {"skymap": skymap_name, "id": 0, "region": parent_region_small},
1954 update=True,
1955 )
1956 self.assertEqual(updated, {"region": parent_region_large})
1957 registry.insertDimensionData(
1958 "patch",
1959 *[
1960 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1961 for n, c in enumerate(child_regions_small)
1962 ],
1963 replace=True,
1964 )
1965 # Query again; there now should be no such overlaps, because the
1966 # database has the "small" suite of regions.
1967 self.assertFalse(
1968 set(
1969 registry.queryDataIds(
1970 ["tract"],
1971 skymap=skymap_name,
1972 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
1973 )
1974 )
1975 )
1976 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
1977 self.assertNotIn(
1978 patch_id,
1979 {
1980 data_id["patch"]
1981 for data_id in registry.queryDataIds(
1982 ["patch"],
1983 skymap=skymap_name,
1984 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
1985 )
1986 },
1987 )
1988 # Update back to the large regions and query one more time.
1989 updated = registry.syncDimensionData(
1990 "tract",
1991 {"skymap": skymap_name, "id": 0, "region": parent_region_large},
1992 update=True,
1993 )
1994 self.assertEqual(updated, {"region": parent_region_small})
1995 registry.insertDimensionData(
1996 "patch",
1997 *[
1998 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1999 for n, c in enumerate(child_regions_large)
2000 ],
2001 replace=True,
2002 )
2003 self.assertEqual(
2004 {0},
2005 {
2006 data_id["tract"]
2007 for data_id in registry.queryDataIds(
2008 ["tract"],
2009 skymap=skymap_name,
2010 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
2011 )
2012 },
2013 )
2014 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
2015 self.assertIn(
2016 patch_id,
2017 {
2018 data_id["patch"]
2019 for data_id in registry.queryDataIds(
2020 ["patch"],
2021 skymap=skymap_name,
2022 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
2023 )
2024 },
2025 )
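# The two update mechanisms exercised above, side by side (sketch, using
# names from this test):
#
#     # syncDimensionData returns the *previous* values of any columns it
#     # changed, e.g. {"region": parent_region_large}:
#     updated = registry.syncDimensionData("tract", record, update=True)
#
#     # insertDimensionData with replace=True overwrites matching records
#     # in place, preserving foreign keys from existing datasets:
#     registry.insertDimensionData("patch", *records, replace=True)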
2027 def testCalibrationCollections(self):
2028 """Test operations on `~CollectionType.CALIBRATION` collections,
2029 including `Registry.certify`, `Registry.decertify`, and
2030 `Registry.findDataset`.
2031 """
2032 # Setup - make a Registry, fill it with some datasets in
2033 # non-calibration collections.
2034 registry = self.makeRegistry()
2035 self.loadData(registry, "base.yaml")
2036 self.loadData(registry, "datasets.yaml")
2037 # Set up some timestamps.
2038 t1 = astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai")
2039 t2 = astropy.time.Time("2020-01-01T02:00:00", format="isot", scale="tai")
2040 t3 = astropy.time.Time("2020-01-01T03:00:00", format="isot", scale="tai")
2041 t4 = astropy.time.Time("2020-01-01T04:00:00", format="isot", scale="tai")
2042 t5 = astropy.time.Time("2020-01-01T05:00:00", format="isot", scale="tai")
2043 allTimespans = [
2044 Timespan(a, b) for a, b in itertools.combinations([None, t1, t2, t3, t4, t5, None], r=2)
2045 ]
2046 # Get references to some datasets.
2047 bias2a = registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g")
2048 bias3a = registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g")
2049 bias2b = registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r")
2050 bias3b = registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r")
2051 # Register the main calibration collection we'll be working with.
2052 collection = "Cam1/calibs/default"
2053 registry.registerCollection(collection, type=CollectionType.CALIBRATION)
2054 # Cannot associate into a calibration collection (no timespan).
2055 with self.assertRaises(CollectionTypeError):
2056 registry.associate(collection, [bias2a])
2057 # Certify 2a dataset with [t2, t4) validity.
2058 registry.certify(collection, [bias2a], Timespan(begin=t2, end=t4))
2059 # Test that we can query for this dataset via the new collection, both
2060 # on its own and with a RUN collection, as long as we don't try to join
2061 # in temporal dimensions or use findFirst=True.
2062 self.assertEqual(
2063 set(registry.queryDatasets("bias", findFirst=False, collections=collection)),
2064 {bias2a},
2065 )
2066 self.assertEqual(
2067 set(registry.queryDatasets("bias", findFirst=False, collections=[collection, "imported_r"])),
2068 {
2069 bias2a,
2070 bias2b,
2071 bias3b,
2072 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
2073 },
2074 )
2075 self.assertEqual(
2076 set(registry.queryDataIds("detector", datasets="bias", collections=collection)),
2077 {registry.expandDataId(instrument="Cam1", detector=2)},
2078 )
2079 self.assertEqual(
2080 set(registry.queryDataIds("detector", datasets="bias", collections=[collection, "imported_r"])),
2081 {
2082 registry.expandDataId(instrument="Cam1", detector=2),
2083 registry.expandDataId(instrument="Cam1", detector=3),
2084 registry.expandDataId(instrument="Cam1", detector=4),
2085 },
2086 )
2088 # We should not be able to certify 2b with anything overlapping that
2089 # window.
2090 with self.assertRaises(ConflictingDefinitionError):
2091 registry.certify(collection, [bias2b], Timespan(begin=None, end=t3))
2092 with self.assertRaises(ConflictingDefinitionError):
2093 registry.certify(collection, [bias2b], Timespan(begin=None, end=t5))
2094 with self.assertRaises(ConflictingDefinitionError):
2095 registry.certify(collection, [bias2b], Timespan(begin=t1, end=t3))
2096 with self.assertRaises(ConflictingDefinitionError):
2097 registry.certify(collection, [bias2b], Timespan(begin=t1, end=t5))
2098 with self.assertRaises(ConflictingDefinitionError):
2099 registry.certify(collection, [bias2b], Timespan(begin=t1, end=None))
2100 with self.assertRaises(ConflictingDefinitionError):
2101 registry.certify(collection, [bias2b], Timespan(begin=t2, end=t3))
2102 with self.assertRaises(ConflictingDefinitionError):
2103 registry.certify(collection, [bias2b], Timespan(begin=t2, end=t5))
2104 with self.assertRaises(ConflictingDefinitionError):
2105 registry.certify(collection, [bias2b], Timespan(begin=t2, end=None))
2106 # We should be able to certify 3a with a range overlapping that window,
2107 # because it's for a different detector.
2108 # We'll certify 3a over [t1, t3).
2109 registry.certify(collection, [bias3a], Timespan(begin=t1, end=t3))
2110 # Now we'll certify 2b and 3b together over [t4, ∞).
2111 registry.certify(collection, [bias2b, bias3b], Timespan(begin=t4, end=None))
2113 # Fetch all associations and check that they are what we expect.
2114 self.assertCountEqual(
2115 list(
2116 registry.queryDatasetAssociations(
2117 "bias",
2118 collections=[collection, "imported_g", "imported_r"],
2119 )
2120 ),
2121 [
2122 DatasetAssociation(
2123 ref=registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
2124 collection="imported_g",
2125 timespan=None,
2126 ),
2127 DatasetAssociation(
2128 ref=registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
2129 collection="imported_r",
2130 timespan=None,
2131 ),
2132 DatasetAssociation(ref=bias2a, collection="imported_g", timespan=None),
2133 DatasetAssociation(ref=bias3a, collection="imported_g", timespan=None),
2134 DatasetAssociation(ref=bias2b, collection="imported_r", timespan=None),
2135 DatasetAssociation(ref=bias3b, collection="imported_r", timespan=None),
2136 DatasetAssociation(ref=bias2a, collection=collection, timespan=Timespan(begin=t2, end=t4)),
2137 DatasetAssociation(ref=bias3a, collection=collection, timespan=Timespan(begin=t1, end=t3)),
2138 DatasetAssociation(ref=bias2b, collection=collection, timespan=Timespan(begin=t4, end=None)),
2139 DatasetAssociation(ref=bias3b, collection=collection, timespan=Timespan(begin=t4, end=None)),
2140 ],
2141 )
2143 class Ambiguous:
2144 """Tag class to denote lookups that should be ambiguous."""
2146 pass
2148 def assertLookup(
2149 detector: int, timespan: Timespan, expected: DatasetRef | type[Ambiguous] | None
2150 ) -> None:
2151 """Local function that asserts that a bias lookup returns the given
2152 expected result.
2153 """
2154 if expected is Ambiguous:
2155 with self.assertRaises((DatasetTypeError, LookupError)):
2156 registry.findDataset(
2157 "bias",
2158 collections=collection,
2159 instrument="Cam1",
2160 detector=detector,
2161 timespan=timespan,
2162 )
2163 else:
2164 self.assertEqual(
2165 expected,
2166 registry.findDataset(
2167 "bias",
2168 collections=collection,
2169 instrument="Cam1",
2170 detector=detector,
2171 timespan=timespan,
2172 ),
2173 )
2175 # Systematically test lookups against expected results.
2176 assertLookup(detector=2, timespan=Timespan(None, t1), expected=None)
2177 assertLookup(detector=2, timespan=Timespan(None, t2), expected=None)
2178 assertLookup(detector=2, timespan=Timespan(None, t3), expected=bias2a)
2179 assertLookup(detector=2, timespan=Timespan(None, t4), expected=bias2a)
2180 assertLookup(detector=2, timespan=Timespan(None, t5), expected=Ambiguous)
2181 assertLookup(detector=2, timespan=Timespan(None, None), expected=Ambiguous)
2182 assertLookup(detector=2, timespan=Timespan(t1, t2), expected=None)
2183 assertLookup(detector=2, timespan=Timespan(t1, t3), expected=bias2a)
2184 assertLookup(detector=2, timespan=Timespan(t1, t4), expected=bias2a)
2185 assertLookup(detector=2, timespan=Timespan(t1, t5), expected=Ambiguous)
2186 assertLookup(detector=2, timespan=Timespan(t1, None), expected=Ambiguous)
2187 assertLookup(detector=2, timespan=Timespan(t2, t3), expected=bias2a)
2188 assertLookup(detector=2, timespan=Timespan(t2, t4), expected=bias2a)
2189 assertLookup(detector=2, timespan=Timespan(t2, t5), expected=Ambiguous)
2190 assertLookup(detector=2, timespan=Timespan(t2, None), expected=Ambiguous)
2191 assertLookup(detector=2, timespan=Timespan(t3, t4), expected=bias2a)
2192 assertLookup(detector=2, timespan=Timespan(t3, t5), expected=Ambiguous)
2193 assertLookup(detector=2, timespan=Timespan(t3, None), expected=Ambiguous)
2194 assertLookup(detector=2, timespan=Timespan(t4, t5), expected=bias2b)
2195 assertLookup(detector=2, timespan=Timespan(t4, None), expected=bias2b)
2196 assertLookup(detector=2, timespan=Timespan(t5, None), expected=bias2b)
2197 assertLookup(detector=3, timespan=Timespan(None, t1), expected=None)
2198 assertLookup(detector=3, timespan=Timespan(None, t2), expected=bias3a)
2199 assertLookup(detector=3, timespan=Timespan(None, t3), expected=bias3a)
2200 assertLookup(detector=3, timespan=Timespan(None, t4), expected=bias3a)
2201 assertLookup(detector=3, timespan=Timespan(None, t5), expected=Ambiguous)
2202 assertLookup(detector=3, timespan=Timespan(None, None), expected=Ambiguous)
2203 assertLookup(detector=3, timespan=Timespan(t1, t2), expected=bias3a)
2204 assertLookup(detector=3, timespan=Timespan(t1, t3), expected=bias3a)
2205 assertLookup(detector=3, timespan=Timespan(t1, t4), expected=bias3a)
2206 assertLookup(detector=3, timespan=Timespan(t1, t5), expected=Ambiguous)
2207 assertLookup(detector=3, timespan=Timespan(t1, None), expected=Ambiguous)
2208 assertLookup(detector=3, timespan=Timespan(t2, t3), expected=bias3a)
2209 assertLookup(detector=3, timespan=Timespan(t2, t4), expected=bias3a)
2210 assertLookup(detector=3, timespan=Timespan(t2, t5), expected=Ambiguous)
2211 assertLookup(detector=3, timespan=Timespan(t2, None), expected=Ambiguous)
2212 assertLookup(detector=3, timespan=Timespan(t3, t4), expected=None)
2213 assertLookup(detector=3, timespan=Timespan(t3, t5), expected=bias3b)
2214 assertLookup(detector=3, timespan=Timespan(t3, None), expected=bias3b)
2215 assertLookup(detector=3, timespan=Timespan(t4, t5), expected=bias3b)
2216 assertLookup(detector=3, timespan=Timespan(t4, None), expected=bias3b)
2217 assertLookup(detector=3, timespan=Timespan(t5, None), expected=bias3b)
2219 # Decertify [t3, t5) for all data IDs, and do test lookups again.
2220 # This should truncate bias2a to [t2, t3), leave bias3a unchanged at
2221 # [t1, t3), and truncate bias2b and bias3b to [t5, ∞).
2222 registry.decertify(collection=collection, datasetType="bias", timespan=Timespan(t3, t5))
2223 assertLookup(detector=2, timespan=Timespan(None, t1), expected=None)
2224 assertLookup(detector=2, timespan=Timespan(None, t2), expected=None)
2225 assertLookup(detector=2, timespan=Timespan(None, t3), expected=bias2a)
2226 assertLookup(detector=2, timespan=Timespan(None, t4), expected=bias2a)
2227 assertLookup(detector=2, timespan=Timespan(None, t5), expected=bias2a)
2228 assertLookup(detector=2, timespan=Timespan(None, None), expected=Ambiguous)
2229 assertLookup(detector=2, timespan=Timespan(t1, t2), expected=None)
2230 assertLookup(detector=2, timespan=Timespan(t1, t3), expected=bias2a)
2231 assertLookup(detector=2, timespan=Timespan(t1, t4), expected=bias2a)
2232 assertLookup(detector=2, timespan=Timespan(t1, t5), expected=bias2a)
2233 assertLookup(detector=2, timespan=Timespan(t1, None), expected=Ambiguous)
2234 assertLookup(detector=2, timespan=Timespan(t2, t3), expected=bias2a)
2235 assertLookup(detector=2, timespan=Timespan(t2, t4), expected=bias2a)
2236 assertLookup(detector=2, timespan=Timespan(t2, t5), expected=bias2a)
2237 assertLookup(detector=2, timespan=Timespan(t2, None), expected=Ambiguous)
2238 assertLookup(detector=2, timespan=Timespan(t3, t4), expected=None)
2239 assertLookup(detector=2, timespan=Timespan(t3, t5), expected=None)
2240 assertLookup(detector=2, timespan=Timespan(t3, None), expected=bias2b)
2241 assertLookup(detector=2, timespan=Timespan(t4, t5), expected=None)
2242 assertLookup(detector=2, timespan=Timespan(t4, None), expected=bias2b)
2243 assertLookup(detector=2, timespan=Timespan(t5, None), expected=bias2b)
2244 assertLookup(detector=3, timespan=Timespan(None, t1), expected=None)
2245 assertLookup(detector=3, timespan=Timespan(None, t2), expected=bias3a)
2246 assertLookup(detector=3, timespan=Timespan(None, t3), expected=bias3a)
2247 assertLookup(detector=3, timespan=Timespan(None, t4), expected=bias3a)
2248 assertLookup(detector=3, timespan=Timespan(None, t5), expected=bias3a)
2249 assertLookup(detector=3, timespan=Timespan(None, None), expected=Ambiguous)
2250 assertLookup(detector=3, timespan=Timespan(t1, t2), expected=bias3a)
2251 assertLookup(detector=3, timespan=Timespan(t1, t3), expected=bias3a)
2252 assertLookup(detector=3, timespan=Timespan(t1, t4), expected=bias3a)
2253 assertLookup(detector=3, timespan=Timespan(t1, t5), expected=bias3a)
2254 assertLookup(detector=3, timespan=Timespan(t1, None), expected=Ambiguous)
2255 assertLookup(detector=3, timespan=Timespan(t2, t3), expected=bias3a)
2256 assertLookup(detector=3, timespan=Timespan(t2, t4), expected=bias3a)
2257 assertLookup(detector=3, timespan=Timespan(t2, t5), expected=bias3a)
2258 assertLookup(detector=3, timespan=Timespan(t2, None), expected=Ambiguous)
2259 assertLookup(detector=3, timespan=Timespan(t3, t4), expected=None)
2260 assertLookup(detector=3, timespan=Timespan(t3, t5), expected=None)
2261 assertLookup(detector=3, timespan=Timespan(t3, None), expected=bias3b)
2262 assertLookup(detector=3, timespan=Timespan(t4, t5), expected=None)
2263 assertLookup(detector=3, timespan=Timespan(t4, None), expected=bias3b)
2264 assertLookup(detector=3, timespan=Timespan(t5, None), expected=bias3b)
2266 # Decertify everything, this time with explicit data IDs, then check
2267 # that no lookups succeed.
2268 registry.decertify(
2269 collection,
2270 "bias",
2271 Timespan(None, None),
2272 dataIds=[
2273 dict(instrument="Cam1", detector=2),
2274 dict(instrument="Cam1", detector=3),
2275 ],
2276 )
2277 for detector in (2, 3):
2278 for timespan in allTimespans:
2279 assertLookup(detector=detector, timespan=timespan, expected=None)
2280 # Certify bias2a and bias3a over (-∞, ∞), check that all lookups return
2281 # those.
2282 registry.certify(
2283 collection,
2284 [bias2a, bias3a],
2285 Timespan(None, None),
2286 )
2287 for timespan in allTimespans:
2288 assertLookup(detector=2, timespan=timespan, expected=bias2a)
2289 assertLookup(detector=3, timespan=timespan, expected=bias3a)
2290 # Decertify just bias2 over [t2, t4).
2291 # This should split a single certification row into two (and leave the
2292 # other existing row, for bias3a, alone).
2293 registry.decertify(
2294 collection, "bias", Timespan(t2, t4), dataIds=[dict(instrument="Cam1", detector=2)]
2295 )
2296 for timespan in allTimespans:
2297 assertLookup(detector=3, timespan=timespan, expected=bias3a)
2298 overlapsBefore = timespan.overlaps(Timespan(None, t2))
2299 overlapsAfter = timespan.overlaps(Timespan(t4, None))
2300 if overlapsBefore and overlapsAfter:
2301 expected = Ambiguous
2302 elif overlapsBefore or overlapsAfter:
2303 expected = bias2a
2304 else:
2305 expected = None
2306 assertLookup(detector=2, timespan=timespan, expected=expected)
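# The calibration-collection lifecycle exercised above, condensed (sketch,
# with names from this test):
#
#     registry.certify(collection, [bias2a], Timespan(t2, t4))   # add with a validity range
#     registry.decertify(collection, "bias", Timespan(t3, t5))   # remove/truncate ranges
#     ref = registry.findDataset("bias", collections=collection,
#                                instrument="Cam1", detector=2,
#                                timespan=Timespan(t2, t3))      # temporal lookup
#     # A lookup whose timespan overlaps more than one validity range is
#     # ambiguous and raises instead of returning a ref.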
2308 def testSkipCalibs(self):
2309 """Test how queries handle skipping of calibration collections."""
2310 registry = self.makeRegistry()
2311 self.loadData(registry, "base.yaml")
2312 self.loadData(registry, "datasets.yaml")
2314 coll_calib = "Cam1/calibs/default"
2315 registry.registerCollection(coll_calib, type=CollectionType.CALIBRATION)
2317 # Add all biases to the calibration collection.
2318 # Without this, the logic that prunes dataset subqueries based on
2319 # datasetType-collection summary information will fire before the logic
2320 # we want to test below. This is a good thing (it avoids the dreaded
2321 # NotImplementedError a bit more often) everywhere but here.
2322 registry.certify(coll_calib, registry.queryDatasets("bias", collections=...), Timespan(None, None))
2324 coll_list = [coll_calib, "imported_g", "imported_r"]
2325 chain = "Cam1/chain"
2326 registry.registerCollection(chain, type=CollectionType.CHAINED)
2327 registry.setCollectionChain(chain, coll_list)
2329 # explicit list will raise if findFirst=True or there are temporal
2330 # dimensions
2331 with self.assertRaises(NotImplementedError):
2332 registry.queryDatasets("bias", collections=coll_list, findFirst=True)
2333 with self.assertRaises(NotImplementedError):
2334 registry.queryDataIds(
2335 ["instrument", "detector", "exposure"], datasets="bias", collections=coll_list
2336 ).count()
2338 # chain will skip
2339 datasets = list(registry.queryDatasets("bias", collections=chain))
2340 self.assertGreater(len(datasets), 0)
2342 dataIds = list(registry.queryDataIds(["instrument", "detector"], datasets="bias", collections=chain))
2343 self.assertGreater(len(dataIds), 0)
2345 # glob will skip too
2346 datasets = list(registry.queryDatasets("bias", collections="*d*"))
2347 self.assertGreater(len(datasets), 0)
2349 # regular expression will skip too
2350 pattern = re.compile(".*")
2351 datasets = list(registry.queryDatasets("bias", collections=pattern))
2352 self.assertGreater(len(datasets), 0)
2354 # ellipsis should work as usual
2355 datasets = list(registry.queryDatasets("bias", collections=...))
2356 self.assertGreater(len(datasets), 0)
2358 # a few tests with findFirst
2359 datasets = list(registry.queryDatasets("bias", collections=chain, findFirst=True))
2360 self.assertGreater(len(datasets), 0)
2362 def testIngestTimeQuery(self):
2363 registry = self.makeRegistry()
2364 self.loadData(registry, "base.yaml")
2365 dt0 = datetime.utcnow()
2366 self.loadData(registry, "datasets.yaml")
2367 dt1 = datetime.utcnow()
2369 datasets = list(registry.queryDatasets(..., collections=...))
2370 len0 = len(datasets)
2371 self.assertGreater(len0, 0)
2373 where = "ingest_date > T'2000-01-01'"
2374 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2375 len1 = len(datasets)
2376 self.assertEqual(len0, len1)
2378 # no one will ever use this piece of software in 30 years
2379 where = "ingest_date > T'2050-01-01'"
2380 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2381 len2 = len(datasets)
2382 self.assertEqual(len2, 0)
2384 # Check more exact timing to make sure there is no 37-second offset
2385 # (after fixing DM-30124). SQLite time precision is 1 second, so make
2386 # sure that we don't test with higher precision.
2387 tests = [
2388 # format: (timestamp, operator, expected_len)
2389 (dt0 - timedelta(seconds=1), ">", len0),
2390 (dt0 - timedelta(seconds=1), "<", 0),
2391 (dt1 + timedelta(seconds=1), "<", len0),
2392 (dt1 + timedelta(seconds=1), ">", 0),
2393 ]
2394 for dt, op, expect_len in tests:
2395 dt_str = dt.isoformat(sep=" ")
2397 where = f"ingest_date {op} T'{dt_str}'"
2398 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2399 self.assertEqual(len(datasets), expect_len)
2401 # same with bind using datetime or astropy Time
2402 where = f"ingest_date {op} ingest_time"
2403 datasets = list(
2404 registry.queryDatasets(..., collections=..., where=where, bind={"ingest_time": dt})
2405 )
2406 self.assertEqual(len(datasets), expect_len)
2408 dt_astropy = astropy.time.Time(dt, format="datetime")
2409 datasets = list(
2410 registry.queryDatasets(..., collections=..., where=where, bind={"ingest_time": dt_astropy})
2411 )
2412 self.assertEqual(len(datasets), expect_len)
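# The ingest_date idioms above, in brief (sketch): timestamps can be
# written as T'...' literals in the expression or passed via bind as
# datetime or astropy.time.Time values:
#
#     registry.queryDatasets(..., collections=..., where="ingest_date > T'2020-01-01'")
#     registry.queryDatasets(..., collections=..., where="ingest_date > t0",
#                            bind={"t0": astropy.time.Time("2020-01-01", scale="utc")})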
2414 def testTimespanQueries(self):
2415 """Test query expressions involving timespans."""
2416 registry = self.makeRegistry()
2417 self.loadData(registry, "hsc-rc2-subset.yaml")
2418 # All visits in the database; mapping from ID to timespan.
2419 visits = {record.id: record.timespan for record in registry.queryDimensionRecords("visit")}
2420 # Just those IDs, sorted (which is also temporal sorting, because HSC
2421 # visit IDs are monotonically increasing).
2422 ids = sorted(visits.keys())
2423 self.assertGreater(len(ids), 20)
2424 # Pick some quasi-random indexes into `ids` to play with.
2425 i1 = int(len(ids) * 0.1)
2426 i2 = int(len(ids) * 0.3)
2427 i3 = int(len(ids) * 0.6)
2428 i4 = int(len(ids) * 0.8)
2429 # Extract some times from those: just before the beginning of i1 (which
2430 # should be after the end of the previous visit), exactly the
2431 # beginning of i2, just after the beginning of i3 (and before its end),
2432 # and the exact end of i4.
2433 t1 = visits[ids[i1]].begin - astropy.time.TimeDelta(1.0, format="sec")
2434 self.assertGreater(t1, visits[ids[i1 - 1]].end)
2435 t2 = visits[ids[i2]].begin
2436 t3 = visits[ids[i3]].begin + astropy.time.TimeDelta(1.0, format="sec")
2437 self.assertLess(t3, visits[ids[i3]].end)
2438 t4 = visits[ids[i4]].end
2439 # Make sure those are actually in order.
2440 self.assertEqual([t1, t2, t3, t4], sorted([t4, t3, t2, t1]))
2442 bind = {
2443 "t1": t1,
2444 "t2": t2,
2445 "t3": t3,
2446 "t4": t4,
2447 "ts23": Timespan(t2, t3),
2448 }
2450 def query(where):
2451 """Return results as a sorted, deduplicated list of visit IDs."""
2452 return sorted(
2453 {
2454 dataId["visit"]
2455 for dataId in registry.queryDataIds("visit", instrument="HSC", bind=bind, where=where)
2456 }
2457 )
2459 # Try a bunch of timespan queries, mixing up the bounds themselves,
2460 # where they appear in the expression, and how we get the timespan into
2461 # the expression.
2463 # t1 is before the start of i1, so this should not include i1.
2464 self.assertEqual(ids[:i1], query("visit.timespan OVERLAPS (null, t1)"))
2465 # t2 is exactly at the start of i2, but ends are exclusive, so these
2466 # should not include i2.
2467 self.assertEqual(ids[i1:i2], query("(t1, t2) OVERLAPS visit.timespan"))
2468 self.assertEqual(ids[:i2], query("visit.timespan < (t2, t4)"))
2469 # t3 is in the middle of i3, so this should include i3.
2470 self.assertEqual(ids[i2 : i3 + 1], query("visit.timespan OVERLAPS ts23"))
2471 # This one should not include i3, by the same reasoning.
2472 self.assertEqual(ids[i3 + 1 :], query("visit.timespan > (t1, t3)"))
2473 # t4 is exactly at the end of i4, so this should include i4.
2474 self.assertEqual(ids[i3 : i4 + 1], query(f"visit.timespan OVERLAPS (T'{t3.tai.isot}', t4)"))
2475 # i4's upper bound of t4 is exclusive, so this should not include i4.
2476 self.assertEqual(ids[i4 + 1 :], query("visit.timespan OVERLAPS (t4, NULL)"))
2478 # Now some timespan vs. time scalar queries.
2479 self.assertEqual(ids[:i2], query("visit.timespan < t2"))
2480 self.assertEqual(ids[:i2], query("t2 > visit.timespan"))
2481 self.assertEqual(ids[i3 + 1 :], query("visit.timespan > t3"))
2482 self.assertEqual(ids[i3 + 1 :], query("t3 < visit.timespan"))
2483 self.assertEqual(ids[i3 : i3 + 1], query("visit.timespan OVERLAPS t3"))
2484 self.assertEqual(ids[i3 : i3 + 1], query(f"T'{t3.tai.isot}' OVERLAPS visit.timespan"))
2486 # Empty timespans should not overlap anything.
2487 self.assertEqual([], query("visit.timespan OVERLAPS (t3, t2)"))
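# The interval semantics the assertions above rely on, stated directly
# (sketch, in the query expression language): timespans are half-open,
# [begin, end), so
#
#     "visit.timespan OVERLAPS (t1, t2)"  # excludes visits beginning exactly at t2
#     "visit.timespan OVERLAPS (t3, t2)"  # empty interval; overlaps nothing
#     "visit.timespan < t2"               # whole timespan precedes t2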
2489 def testCollectionSummaries(self):
2490 """Test recording and retrieval of collection summaries."""
2491 self.maxDiff = None
2492 registry = self.makeRegistry()
2493 # Importing datasets from yaml should go through the code path where
2494 # we update collection summaries as we insert datasets.
2495 self.loadData(registry, "base.yaml")
2496 self.loadData(registry, "datasets.yaml")
2497 flat = registry.getDatasetType("flat")
2498 expected1 = CollectionSummary()
2499 expected1.dataset_types.add(registry.getDatasetType("bias"))
2500 expected1.add_data_ids(
2501 flat, [DataCoordinate.standardize(instrument="Cam1", universe=registry.dimensions)]
2502 )
2503 self.assertEqual(registry.getCollectionSummary("imported_g"), expected1)
2504 self.assertEqual(registry.getCollectionSummary("imported_r"), expected1)
2505 # Create a chained collection with both of the imported runs; the
2506 # summary should be the same, because it's a union with itself.
2507 chain = "chain"
2508 registry.registerCollection(chain, CollectionType.CHAINED)
2509 registry.setCollectionChain(chain, ["imported_r", "imported_g"])
2510 self.assertEqual(registry.getCollectionSummary(chain), expected1)
2511 # Associate flats only into a tagged collection and a calibration
2512 # collection to check summaries of those.
2513 tag = "tag"
2514 registry.registerCollection(tag, CollectionType.TAGGED)
2515 registry.associate(tag, registry.queryDatasets(flat, collections="imported_g"))
2516 calibs = "calibs"
2517 registry.registerCollection(calibs, CollectionType.CALIBRATION)
2518 registry.certify(
2519 calibs, registry.queryDatasets(flat, collections="imported_g"), timespan=Timespan(None, None)
2520 )
2521 expected2 = expected1.copy()
2522 expected2.dataset_types.discard("bias")
2523 self.assertEqual(registry.getCollectionSummary(tag), expected2)
2524 self.assertEqual(registry.getCollectionSummary(calibs), expected2)
2525 # Explicitly calling Registry.refresh() should load those same
2526 # summaries, via a totally different code path.
2527 registry.refresh()
2528 self.assertEqual(registry.getCollectionSummary("imported_g"), expected1)
2529 self.assertEqual(registry.getCollectionSummary("imported_r"), expected1)
2530 self.assertEqual(registry.getCollectionSummary(tag), expected2)
2531 self.assertEqual(registry.getCollectionSummary(calibs), expected2)
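# A summary lookup in isolation (sketch): CollectionSummary records which
# dataset types (and governor-dimension values) a collection can contain,
# which lets queries skip collections that cannot match:
#
#     summary = registry.getCollectionSummary("imported_g")
#     might_have_bias = "bias" in summary.dataset_types.names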
2533 def testBindInQueryDatasets(self):
2534 """Test that the bind parameter is correctly forwarded in
2535 queryDatasets recursion.
2536 """
2537 registry = self.makeRegistry()
2538 # Importing datasets from yaml should go through the code path where
2539 # we update collection summaries as we insert datasets.
2540 self.loadData(registry, "base.yaml")
2541 self.loadData(registry, "datasets.yaml")
2542 self.assertEqual(
2543 set(registry.queryDatasets("flat", band="r", collections=...)),
2544 set(registry.queryDatasets("flat", where="band=my_band", bind={"my_band": "r"}, collections=...)),
2545 )
2547 def testQueryIntRangeExpressions(self):
2548 """Test integer range expressions in ``where`` arguments.
2550 Note that our expressions use inclusive stop values, unlike Python's.
2551 """
2552 registry = self.makeRegistry()
2553 self.loadData(registry, "base.yaml")
2554 self.assertEqual(
2555 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (1..2)")),
2556 {registry.expandDataId(instrument="Cam1", detector=n) for n in [1, 2]},
2557 )
2558 self.assertEqual(
2559 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (1..4:2)")),
2560 {registry.expandDataId(instrument="Cam1", detector=n) for n in [1, 3]},
2561 )
2562 self.assertEqual(
2563 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (2..4:2)")),
2564 {registry.expandDataId(instrument="Cam1", detector=n) for n in [2, 4]},
2565 )
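# For reference, the range syntax exercised above: "detector IN (1..2)"
# expands to {1, 2} (the stop value is inclusive), and "detector IN
# (1..4:2)" applies a step of 2, yielding {1, 3}.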
2567 def testQueryResultSummaries(self):
2568 """Test summary methods like `count`, `any`, and `explain_no_results`
2569 on `DataCoordinateQueryResults` and `DatasetQueryResults`.
2570 """
2571 registry = self.makeRegistry()
2572 self.loadData(registry, "base.yaml")
2573 self.loadData(registry, "datasets.yaml")
2574 self.loadData(registry, "spatial.yaml")
2575 # Default test dataset has two collections, each with both flats and
2576 # biases. Add a new collection with only biases.
2577 registry.registerCollection("biases", CollectionType.TAGGED)
2578 registry.associate("biases", registry.queryDatasets("bias", collections=["imported_g"]))
2579 # First query yields two results, and involves no postprocessing.
2580 query1 = registry.queryDataIds(["physical_filter"], band="r")
2581 self.assertTrue(query1.any(execute=False, exact=False))
2582 self.assertTrue(query1.any(execute=True, exact=False))
2583 self.assertTrue(query1.any(execute=True, exact=True))
2584 self.assertEqual(query1.count(exact=False), 2)
2585 self.assertEqual(query1.count(exact=True), 2)
2586 self.assertFalse(list(query1.explain_no_results()))
2587 # Second query should yield no results, which we should see when
2588 # we attempt to expand the data ID.
2589 query2 = registry.queryDataIds(["physical_filter"], band="h")
2590 # There's no execute=False, exact=False test here because the behavior
2591 # is not something we want to guarantee in this case (and exact=False
2592 # says either answer is legal).
2593 self.assertFalse(query2.any(execute=True, exact=False))
2594 self.assertFalse(query2.any(execute=True, exact=True))
2595 self.assertEqual(query2.count(exact=False), 0)
2596 self.assertEqual(query2.count(exact=True), 0)
2597 self.assertTrue(list(query2.explain_no_results()))
2598 # These queries yield no results due to various problems that can be
2599 # spotted prior to execution, yielding helpful diagnostics.
2600 base_query = registry.queryDataIds(["detector", "physical_filter"])
2601 queries_and_snippets = [
2602 (
2603 # Dataset type name doesn't match any existing dataset types.
2604 registry.queryDatasets("nonexistent", collections=...),
2605 ["nonexistent"],
2606 ),
2607 (
2608 # Dataset type object isn't registered.
2609 registry.queryDatasets(
2610 DatasetType(
2611 "nonexistent",
2612 dimensions=["instrument"],
2613 universe=registry.dimensions,
2614 storageClass="Image",
2615 ),
2616 collections=...,
2617 ),
2618 ["nonexistent"],
2619 ),
2620 (
2621 # No datasets of this type in this collection.
2622 registry.queryDatasets("flat", collections=["biases"]),
2623 ["flat", "biases"],
2624 ),
2625 (
2626 # No datasets of this type in this collection.
2627 base_query.findDatasets("flat", collections=["biases"]),
2628 ["flat", "biases"],
2629 ),
2630 (
2631 # No collections matching at all.
2632 registry.queryDatasets("flat", collections=re.compile("potato.+")),
2633 ["potato"],
2634 ),
2635 ]
2636 # The behavior of these additional queries is slated to change in the
2637 # future, so we also check for deprecation warnings.
2638 with self.assertWarns(FutureWarning):
2639 queries_and_snippets.append(
2640 (
2641 # Dataset type name doesn't match any existing dataset
2642 # types.
2643 registry.queryDataIds(["detector"], datasets=["nonexistent"], collections=...),
2644 ["nonexistent"],
2645 )
2646 )
2647 with self.assertWarns(FutureWarning):
2648 queries_and_snippets.append(
2649 (
2650 # Dataset type name doesn't match any existing dataset
2651 # types.
2652 registry.queryDimensionRecords("detector", datasets=["nonexistent"], collections=...),
2653 ["nonexistent"],
2654 )
2655 )
2656 for query, snippets in queries_and_snippets:
2657 self.assertFalse(query.any(execute=False, exact=False))
2658 self.assertFalse(query.any(execute=True, exact=False))
2659 self.assertFalse(query.any(execute=True, exact=True))
2660 self.assertEqual(query.count(exact=False), 0)
2661 self.assertEqual(query.count(exact=True), 0)
2662 messages = list(query.explain_no_results())
2663 self.assertTrue(messages)
2664 # Want all expected snippets to appear in at least one message.
2665 self.assertTrue(
2666 any(
2667 all(snippet in message for snippet in snippets) for message in query.explain_no_results()
2668 ),
2669 messages,
2670 )
2672 # This query does yield results, but should also emit a warning, because
2673 # passing dataset type patterns to queryDataIds is deprecated; here we
2674 # just check for the warning.
2675 with self.assertWarns(FutureWarning):
2676 registry.queryDataIds(["detector"], datasets=re.compile("^nonexistent$"), collections=...)
2678 # These queries yield no results due to problems that can be identified
2679 # by cheap follow-up queries, yielding helpful diagnostics.
2680 for query, snippets in [
2681 (
2682 # No records for one of the involved dimensions.
2683 registry.queryDataIds(["subfilter"]),
2684 ["no rows", "subfilter"],
2685 ),
2686 (
2687 # No records for one of the involved dimensions.
2688 registry.queryDimensionRecords("subfilter"),
2689 ["no rows", "subfilter"],
2690 ),
2691 ]:
2692 self.assertFalse(query.any(execute=True, exact=False))
2693 self.assertFalse(query.any(execute=True, exact=True))
2694 self.assertEqual(query.count(exact=True), 0)
2695 messages = list(query.explain_no_results())
2696 self.assertTrue(messages)
2697 # Want all expected snippets to appear in at least one message.
2698 self.assertTrue(
2699 any(
2700 all(snippet in message for snippet in snippets) for message in query.explain_no_results()
2701 ),
2702 messages,
2703 )
2705 # This query yields four overlaps in the database, but one is filtered
2706 # out in postprocessing. The count queries aren't accurate because
2707 # they don't account for duplication that happens due to an internal
2708 # join against commonSkyPix.
2709 query3 = registry.queryDataIds(["visit", "tract"], instrument="Cam1", skymap="SkyMap1")
2710 self.assertEqual(
2711 {
2712 DataCoordinate.standardize(
2713 instrument="Cam1",
2714 skymap="SkyMap1",
2715 visit=v,
2716 tract=t,
2717 universe=registry.dimensions,
2718 )
2719 for v, t in [(1, 0), (2, 0), (2, 1)]
2720 },
2721 set(query3),
2722 )
2723 self.assertTrue(query3.any(execute=False, exact=False))
2724 self.assertTrue(query3.any(execute=True, exact=False))
2725 self.assertTrue(query3.any(execute=True, exact=True))
2726 self.assertGreaterEqual(query3.count(exact=False), 4)
2727 self.assertGreaterEqual(query3.count(exact=True, discard=True), 3)
2728 self.assertFalse(list(query3.explain_no_results()))
2729 # This query yields overlaps in the database, but all are filtered
2730 # out in postprocessing. The count queries again aren't very useful.
2731 # We have to use `where=` here to avoid an optimization that
2732 # (currently) skips the spatial postprocess-filtering because it
2733 # recognizes that no spatial join is necessary. That's not ideal, but
2734 # fixing it is out of scope for this ticket.
2735 query4 = registry.queryDataIds(
2736 ["visit", "tract"],
2737 instrument="Cam1",
2738 skymap="SkyMap1",
2739 where="visit=1 AND detector=1 AND tract=0 AND patch=4",
2740 )
2741 self.assertFalse(set(query4))
2742 self.assertTrue(query4.any(execute=False, exact=False))
2743 self.assertTrue(query4.any(execute=True, exact=False))
2744 self.assertFalse(query4.any(execute=True, exact=True))
2745 self.assertGreaterEqual(query4.count(exact=False), 1)
2746 self.assertEqual(query4.count(exact=True, discard=True), 0)
2747 messages = query4.explain_no_results()
2748 self.assertTrue(messages)
2749 self.assertTrue(any("overlap" in message for message in messages))
2750 # This query should yield results from one dataset type but not the
2751 # other, which is not registered.
2752 query5 = registry.queryDatasets(["bias", "nonexistent"], collections=["biases"])
2753 self.assertTrue(set(query5))
2754 self.assertTrue(query5.any(execute=False, exact=False))
2755 self.assertTrue(query5.any(execute=True, exact=False))
2756 self.assertTrue(query5.any(execute=True, exact=True))
2757 self.assertGreaterEqual(query5.count(exact=False), 1)
2758 self.assertGreaterEqual(query5.count(exact=True), 1)
2759 self.assertFalse(list(query5.explain_no_results()))
2760 # This query applies a selection that yields no results, fully in the
2761 # database. Explaining why it fails involves traversing the relation
2762 # tree and running a LIMIT 1 query at each level that has the potential
2763 # to remove rows.
2764 query6 = registry.queryDimensionRecords(
2765 "detector", where="detector.purpose = 'no-purpose'", instrument="Cam1"
2766 )
2767 self.assertEqual(query6.count(exact=True), 0)
2768 messages = query6.explain_no_results()
2769 self.assertTrue(messages)
2770 self.assertTrue(any("no-purpose" in message for message in messages))
2772 def testQueryDataIdsExpressionError(self):
2773 """Test error checking of 'where' expressions in queryDataIds."""
2774 registry = self.makeRegistry()
2775 self.loadData(registry, "base.yaml")
2776 bind = {"time": astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai")}
2777 with self.assertRaisesRegex(LookupError, r"No dimension element with name 'foo' in 'foo\.bar'\."):
2778 registry.queryDataIds(["detector"], where="foo.bar = 12")
2779 with self.assertRaisesRegex(
2780 LookupError, "Dimension element name cannot be inferred in this context."
2781 ):
2782 registry.queryDataIds(["detector"], where="timespan.end < time", bind=bind)
2784 def testQueryDataIdsOrderBy(self):
2785 """Test order_by and limit on result returned by queryDataIds()."""
2786 registry = self.makeRegistry()
2787 self.loadData(registry, "base.yaml")
2788 self.loadData(registry, "datasets.yaml")
2789 self.loadData(registry, "spatial.yaml")
2791 def do_query(dimensions=("visit", "tract"), datasets=None, collections=None):
2792 return registry.queryDataIds(
2793 dimensions, datasets=datasets, collections=collections, instrument="Cam1", skymap="SkyMap1"
2794 )
2796 Test = namedtuple(
2797 "testQueryDataIdsOrderByTest",
2798 ("order_by", "keys", "result", "limit", "datasets", "collections"),
2799 defaults=(None, None, None),
2800 )
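# In the cases below, ``limit`` is (limit,) or (limit, offset): e.g.
# limit=(3, 3) keeps three rows after skipping the first three rows of
# the ordered result.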
2802 test_data = (
2803 Test("tract,visit", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2804 Test("-tract,visit", "tract,visit", ((1, 2), (1, 2), (0, 1), (0, 1), (0, 2), (0, 2))),
2805 Test("tract,-visit", "tract,visit", ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2))),
2806 Test("-tract,-visit", "tract,visit", ((1, 2), (1, 2), (0, 2), (0, 2), (0, 1), (0, 1))),
2807 Test(
2808 "tract.id,visit.id",
2809 "tract,visit",
2810 ((0, 1), (0, 1), (0, 2)),
2811 limit=(3,),
2812 ),
2813 Test("-tract,-visit", "tract,visit", ((1, 2), (1, 2), (0, 2)), limit=(3,)),
2814 Test("tract,visit", "tract,visit", ((0, 2), (1, 2), (1, 2)), limit=(3, 3)),
2815 Test("-tract,-visit", "tract,visit", ((0, 1),), limit=(3, 5)),
2816 Test(
2817 "tract,visit.exposure_time", "tract,visit", ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2))
2818 ),
2819 Test(
2820 "-tract,-visit.exposure_time", "tract,visit", ((1, 2), (1, 2), (0, 1), (0, 1), (0, 2), (0, 2))
2821 ),
2822 Test("tract,-exposure_time", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2823 Test("tract,visit.name", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2824 Test(
2825 "tract,-timespan.begin,timespan.end",
2826 "tract,visit",
2827 ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2)),
2828 ),
2829 Test("visit.day_obs,exposure.day_obs", "visit,exposure", ()),
2830 Test("visit.timespan.begin,-exposure.timespan.begin", "visit,exposure", ()),
2831 Test(
2832 "tract,detector",
2833 "tract,detector",
2834 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2835 datasets="flat",
2836 collections="imported_r",
2837 ),
2838 Test(
2839 "tract,detector.full_name",
2840 "tract,detector",
2841 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2842 datasets="flat",
2843 collections="imported_r",
2844 ),
2845 Test(
2846 "tract,detector.raft,detector.name_in_raft",
2847 "tract,detector",
2848 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2849 datasets="flat",
2850 collections="imported_r",
2851 ),
2852 )
2854 for test in test_data:
2855 order_by = test.order_by.split(",")
2856 keys = test.keys.split(",")
2857 query = do_query(keys, test.datasets, test.collections).order_by(*order_by)
2858 if test.limit is not None:
2859 query = query.limit(*test.limit)
2860 dataIds = tuple(tuple(dataId[k] for k in keys) for dataId in query)
2861 self.assertEqual(dataIds, test.result)
2863 # Materializing a query with order_by/limit applied is not supported
2864 query = do_query(keys).order_by(*order_by)
2865 if test.limit is not None:
2866 query = query.limit(*test.limit)
2867 with self.assertRaises(RelationalAlgebraError):
2868 with query.materialize():
2869 pass
2871 # Error cases: bad names passed to ORDER BY.
2872 for order_by in ("", "-"):
2873 with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"):
2874 list(do_query().order_by(order_by))
2876 for order_by in ("undimension.name", "-undimension.name"):
2877 with self.assertRaisesRegex(ValueError, "Unknown dimension element 'undimension'"):
2878 list(do_query().order_by(order_by))
2880 for order_by in ("attract", "-attract"):
2881 with self.assertRaisesRegex(ValueError, "Metadata 'attract' cannot be found in any dimension"):
2882 list(do_query().order_by(order_by))
2884 with self.assertRaisesRegex(ValueError, "Metadata 'exposure_time' exists in more than one dimension"):
2885 list(do_query(("exposure", "visit")).order_by("exposure_time"))
2887 with self.assertRaisesRegex(
2888 ValueError,
2889 r"Timespan exists in more than one dimension element \(exposure, visit\); "
2890 r"qualify timespan with specific dimension name\.",
2891 ):
2892 list(do_query(("exposure", "visit")).order_by("timespan.begin"))
2894 with self.assertRaisesRegex(
2895 ValueError, "Cannot find any temporal dimension element for 'timespan.begin'"
2896 ):
2897 list(do_query("tract").order_by("timespan.begin"))
2899 with self.assertRaisesRegex(ValueError, "Cannot use 'timespan.begin' with non-temporal element"):
2900 list(do_query("tract").order_by("tract.timespan.begin"))
2902 with self.assertRaisesRegex(ValueError, "Field 'name' does not exist in 'tract'."):
2903 list(do_query("tract").order_by("tract.name"))
2905 with self.assertRaisesRegex(
2906 ValueError, r"Unknown dimension element 'timestamp'; perhaps you meant 'timespan.begin'\?"
2907 ):
2908 list(do_query("visit").order_by("timestamp.begin"))
2910 def testQueryDataIdsGovernorExceptions(self):
2911 """Test exceptions raised by queryDataIds() for incorrect governors."""
2912 registry = self.makeRegistry()
2913 self.loadData(registry, "base.yaml")
2914 self.loadData(registry, "datasets.yaml")
2915 self.loadData(registry, "spatial.yaml")
2917 def do_query(dimensions, dataId=None, where="", bind=None, **kwargs):
2918 return registry.queryDataIds(dimensions, dataId=dataId, where=where, bind=bind, **kwargs)
2920 Test = namedtuple(
2921 "testQueryDataIdExceptionsTest",
2922 ("dimensions", "dataId", "where", "bind", "kwargs", "exception", "count"),
2923 defaults=(None, None, None, {}, None, 0),
2924 )
2926 test_data = (
2927 Test("tract,visit", count=6),
2928 Test("tract,visit", kwargs={"instrument": "Cam1", "skymap": "SkyMap1"}, count=6),
2929 Test(
2930 "tract,visit", kwargs={"instrument": "Cam2", "skymap": "SkyMap1"}, exception=DataIdValueError
2931 ),
2932 Test("tract,visit", dataId={"instrument": "Cam1", "skymap": "SkyMap1"}, count=6),
2933 Test(
2934 "tract,visit", dataId={"instrument": "Cam1", "skymap": "SkyMap2"}, exception=DataIdValueError
2935 ),
2936 Test("tract,visit", where="instrument='Cam1' AND skymap='SkyMap1'", count=6),
2937 Test("tract,visit", where="instrument='Cam1' AND skymap='SkyMap5'", exception=DataIdValueError),
2938 Test(
2939 "tract,visit",
2940 where="instrument=cam AND skymap=map",
2941 bind={"cam": "Cam1", "map": "SkyMap1"},
2942 count=6,
2943 ),
2944 Test(
2945 "tract,visit",
2946 where="instrument=cam AND skymap=map",
2947 bind={"cam": "Cam", "map": "SkyMap"},
2948 exception=DataIdValueError,
2949 ),
2950 )
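# In the failing cases above, DataIdValueError is expected because
# governor dimension values (instrument, skymap) are validated against
# known records up front, whether they arrive via kwargs, dataId, or a
# 'where' expression.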
2952 for test in test_data:
2953 dimensions = test.dimensions.split(",")
2954 if test.exception:
2955 with self.assertRaises(test.exception):
2956 do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs).count()
2957 else:
2958 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2959 self.assertEqual(query.count(discard=True), test.count)
2961 # Repeat the same checks through materialize().
2962 if test.exception:
2963 with self.assertRaises(test.exception):
2964 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2965 with query.materialize() as materialized:
2966 materialized.count(discard=True)
2967 else:
2968 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2969 with query.materialize() as materialized:
2970 self.assertEqual(materialized.count(discard=True), test.count)
2972 def testQueryDimensionRecordsOrderBy(self):
2973 """Test order_by and limit on result returned by
2974 queryDimensionRecords().
2975 """
2976 registry = self.makeRegistry()
2977 self.loadData(registry, "base.yaml")
2978 self.loadData(registry, "datasets.yaml")
2979 self.loadData(registry, "spatial.yaml")
2981 def do_query(element, datasets=None, collections=None):
2982 return registry.queryDimensionRecords(
2983 element, instrument="Cam1", datasets=datasets, collections=collections
2984 )
2986 query = do_query("detector")
2987 self.assertEqual(len(list(query)), 4)
2989 Test = namedtuple(
2990 "testQueryDataIdsOrderByTest",
2991 ("element", "order_by", "result", "limit", "datasets", "collections"),
2992 defaults=(None, None, None),
2993 )
2995 test_data = (
2996 Test("detector", "detector", (1, 2, 3, 4)),
2997 Test("detector", "-detector", (4, 3, 2, 1)),
2998 Test("detector", "raft,-name_in_raft", (2, 1, 4, 3)),
2999 Test("detector", "-detector.purpose", (4,), limit=(1,)),
3000 Test("detector", "-purpose,detector.raft,name_in_raft", (2, 3), limit=(2, 2)),
3001 Test("visit", "visit", (1, 2)),
3002 Test("visit", "-visit.id", (2, 1)),
3003 Test("visit", "zenith_angle", (1, 2)),
3004 Test("visit", "-visit.name", (2, 1)),
3005 Test("visit", "day_obs,-timespan.begin", (2, 1)),
3006 )
3008 for test in test_data:
3009 order_by = test.order_by.split(",")
3010 query = do_query(test.element).order_by(*order_by)
3011 if test.limit is not None:
3012 query = query.limit(*test.limit)
3013 dataIds = tuple(rec.id for rec in query)
3014 self.assertEqual(dataIds, test.result)
3016 # Error cases: bad names passed to ORDER BY.
3017 for order_by in ("", "-"):
3018 with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"):
3019 list(do_query("detector").order_by(order_by))
3021 for order_by in ("undimension.name", "-undimension.name"):
3022 with self.assertRaisesRegex(ValueError, "Element name mismatch: 'undimension'"):
3023 list(do_query("detector").order_by(order_by))
3025 for order_by in ("attract", "-attract"):
3026 with self.assertRaisesRegex(ValueError, "Field 'attract' does not exist in 'detector'."):
3027 list(do_query("detector").order_by(order_by))
3029 for order_by in ("timestamp.begin", "-timestamp.begin"):
3030 with self.assertRaisesRegex(
3031 ValueError,
3032 r"Element name mismatch: 'timestamp' instead of 'visit'; "
3033 r"perhaps you meant 'timespan.begin'\?",
3034 ):
3035 list(do_query("visit").order_by(order_by))
3037 def testQueryDimensionRecordsExceptions(self):
3038 """Test exceptions raised by queryDimensionRecords()."""
3039 registry = self.makeRegistry()
3040 self.loadData(registry, "base.yaml")
3041 self.loadData(registry, "datasets.yaml")
3042 self.loadData(registry, "spatial.yaml")
3044 result = registry.queryDimensionRecords("detector")
3045 self.assertEqual(result.count(), 4)
3046 result = registry.queryDimensionRecords("detector", instrument="Cam1")
3047 self.assertEqual(result.count(), 4)
3048 result = registry.queryDimensionRecords("detector", dataId={"instrument": "Cam1"})
3049 self.assertEqual(result.count(), 4)
3050 result = registry.queryDimensionRecords("detector", where="instrument='Cam1'")
3051 self.assertEqual(result.count(), 4)
3052 result = registry.queryDimensionRecords("detector", where="instrument=instr", bind={"instr": "Cam1"})
3053 self.assertEqual(result.count(), 4)
3055 with self.assertRaisesRegex(DataIdValueError, "dimension instrument"):
3056 result = registry.queryDimensionRecords("detector", instrument="NotCam1")
3057 result.count()
3059 with self.assertRaisesRegex(DataIdValueError, "dimension instrument"):
3060 result = registry.queryDimensionRecords("detector", dataId={"instrument": "NotCam1"})
3061 result.count()
3063 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
3064 result = registry.queryDimensionRecords("detector", where="instrument='NotCam1'")
3065 result.count()
3067 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
3068 result = registry.queryDimensionRecords(
3069 "detector", where="instrument=instr", bind={"instr": "NotCam1"}
3070 )
3071 result.count()
3073 def testDatasetConstrainedDimensionRecordQueries(self):
3074 """Test that queryDimensionRecords works even when given a dataset
3075 constraint whose dimensions extend beyond the requested dimension
3076 element's.
3077 """
3078 registry = self.makeRegistry()
3079 self.loadData(registry, "base.yaml")
3080 self.loadData(registry, "datasets.yaml")
3081 # Query for physical_filter dimension records, using a dataset type
3082 # whose dimensions include physical_filter as well as detector.
3083 records = registry.queryDimensionRecords(
3084 "physical_filter",
3085 datasets=["flat"],
3086 collections="imported_r",
3087 )
3088 self.assertEqual({record.name for record in records}, {"Cam1-R1", "Cam1-R2"})
3089 # Trying to constrain by all dataset types is an error.
3090 with self.assertRaises(TypeError):
3091 list(registry.queryDimensionRecords("physical_filter", datasets=..., collections="imported_r"))
3093 def testSkyPixDatasetQueries(self):
3094 """Test that we can build queries involving skypix dimensions as long
3095 as a dataset type that uses those dimensions is included.
3096 """
3097 registry = self.makeRegistry()
3098 self.loadData(registry, "base.yaml")
3099 dataset_type = DatasetType(
3100 "a", dimensions=["htm7", "instrument"], universe=registry.dimensions, storageClass="int"
3101 )
3102 registry.registerDatasetType(dataset_type)
3103 run = "r"
3104 registry.registerRun(run)
3105 # First try queries where there are no datasets; the concern is whether
3106 # we can even build and execute these queries without raising, even
3107 # when "doomed" query shortcuts are in play.
3108 self.assertFalse(
3109 list(registry.queryDataIds(["htm7", "instrument"], datasets=dataset_type, collections=run))
3110 )
3111 self.assertFalse(list(registry.queryDatasets(dataset_type, collections=run)))
3112 # Now add a dataset and see that we can get it back.
3113 htm7 = registry.dimensions.skypix["htm"][7].pixelization
3114 data_id = registry.expandDataId(instrument="Cam1", htm7=htm7.universe()[0][0])
3115 (ref,) = registry.insertDatasets(dataset_type, [data_id], run=run)
3116 self.assertEqual(
3117 set(registry.queryDataIds(["htm7", "instrument"], datasets=dataset_type, collections=run)),
3118 {data_id},
3119 )
3120 self.assertEqual(set(registry.queryDatasets(dataset_type, collections=run)), {ref})
3122 def testDatasetIdFactory(self):
3123 """Simple test for DatasetIdFactory, mostly to catch potential changes
3124 in its API.
3125 """
3126 registry = self.makeRegistry()
3127 factory = DatasetIdFactory()
3128 dataset_type = DatasetType(
3129 "datasetType",
3130 dimensions=["detector", "instrument"],
3131 universe=registry.dimensions,
3132 storageClass="int",
3133 )
3134 run = "run"
3135 data_id = DataCoordinate.standardize(instrument="Cam1", detector=1, graph=dataset_type.dimensions)
3137 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.UNIQUE)
3138 self.assertIsInstance(datasetId, uuid.UUID)
3139 self.assertEqual(datasetId.version, 4)
3141 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE)
3142 self.assertIsInstance(datasetId, uuid.UUID)
3143 self.assertEqual(datasetId.version, 5)
3145 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN)
3146 self.assertIsInstance(datasetId, uuid.UUID)
3147 self.assertEqual(datasetId.version, 5)
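# For reference: UNIQUE produces a random (version 4) UUID, while the
# DATAID_TYPE* modes produce name-based (version 5) UUIDs; a sketch of
# the determinism this implies (same inputs, same ID):
self.assertEqual(
datasetId, factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN)
)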
3149 def testExposureQueries(self):
3150 """Test query methods using arguments sourced from the exposure log
3151 service.
3153 The most complete test dataset currently available to daf_butler tests
3154 is the hsc-rc2-subset.yaml export (which is unfortunately distinct from
3155 the lsst/rc2_subset GitHub repo), but that does not have 'exposure'
3156 dimension records as it was focused on providing nontrivial spatial
3157 overlaps between visit+detector and tract+patch. So in this test we
3158 need to translate queries that originally used the exposure dimension
3159 to use the (very similar) visit dimension instead.
3160 """
3161 registry = self.makeRegistry()
3162 self.loadData(registry, "hsc-rc2-subset.yaml")
3163 self.assertEqual(
3164 [
3165 record.id
3166 for record in registry.queryDimensionRecords("visit", instrument="HSC")
3167 .order_by("id")
3168 .limit(5)
3169 ],
3170 [318, 322, 326, 330, 332],
3171 )
3172 self.assertEqual(
3173 [
3174 data_id["visit"]
3175 for data_id in registry.queryDataIds(["visit"], instrument="HSC").order_by("id").limit(5)
3176 ],
3177 [318, 322, 326, 330, 332],
3178 )
3179 self.assertEqual(
3180 [
3181 record.id
3182 for record in registry.queryDimensionRecords("detector", instrument="HSC")
3183 .order_by("full_name")
3184 .limit(5)
3185 ],
3186 [73, 72, 71, 70, 65],
3187 )
3188 self.assertEqual(
3189 [
3190 data_id["detector"]
3191 for data_id in registry.queryDataIds(["detector"], instrument="HSC")
3192 .order_by("full_name")
3193 .limit(5)
3194 ],
3195 [73, 72, 71, 70, 65],
3196 )
3198 def test_long_query_names(self) -> None:
3199 """Test that queries involving very long names are handled correctly.
3201 This is especially important for PostgreSQL, which truncates identifiers
3202 longer than 63 characters, but it's worth testing for all DBs.
3203 """
3204 registry = self.makeRegistry()
3205 name = "abcd" * 17
3206 registry.registerDatasetType(
3207 DatasetType(
3208 name,
3209 dimensions=(),
3210 storageClass="Exposure",
3211 universe=registry.dimensions,
3212 )
3213 )
3214 # We need to search more than one collection that actually contains a
3215 # matching dataset; otherwise an optimization makes findFirst=True a
3216 # no-op and would sidestep any bugs caused by truncation.
3217 run1 = "run1"
3218 registry.registerRun(run1)
3219 run2 = "run2"
3220 registry.registerRun(run2)
3221 (ref1,) = registry.insertDatasets(name, [DataCoordinate.makeEmpty(registry.dimensions)], run1)
3222 registry.insertDatasets(name, [DataCoordinate.makeEmpty(registry.dimensions)], run2)
3223 self.assertEqual(
3224 set(registry.queryDatasets(name, collections=[run1, run2], findFirst=True)),
3225 {ref1},
3226 )
3228 def test_skypix_constraint_queries(self) -> None:
3229 """Test queries spatially constrained by a skypix data ID."""
3230 registry = self.makeRegistry()
3231 self.loadData(registry, "hsc-rc2-subset.yaml")
3232 patch_regions = {
3233 (data_id["tract"], data_id["patch"]): data_id.region
3234 for data_id in registry.queryDataIds(["patch"]).expanded()
3235 }
3236 skypix_dimension: SkyPixDimension = registry.dimensions["htm11"]
3237 # This check ensures the test doesn't become trivial due to a config
3238 # change; if it does, just pick a different HTM level.
3239 self.assertNotEqual(skypix_dimension, registry.dimensions.commonSkyPix)
3240 # Gather all skypix IDs that definitely overlap at least one of these
3241 # patches.
3242 relevant_skypix_ids = lsst.sphgeom.RangeSet()
3243 for patch_region in patch_regions.values():
3244 relevant_skypix_ids |= skypix_dimension.pixelization.interior(patch_region)
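# interior(region) yields the indices of pixels wholly contained in the
# region, so every ID accumulated above is guaranteed to overlap at
# least one patch.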
3245 # Look for a "nontrivial" skypix_id that overlaps at least one patch
3246 # and does not overlap at least one other patch.
3247 for skypix_id in itertools.chain.from_iterable(
3248 range(begin, end) for begin, end in relevant_skypix_ids
3249 ):
3250 skypix_region = skypix_dimension.pixelization.pixel(skypix_id)
3251 overlapping_patches = {
3252 patch_key
3253 for patch_key, patch_region in patch_regions.items()
3254 if not patch_region.isDisjointFrom(skypix_region)
3255 }
3256 if overlapping_patches and overlapping_patches != patch_regions.keys():
3257 break
3258 else:
3259 raise RuntimeError("Could not find usable skypix ID for this dimension configuration.")
3260 self.assertEqual(
3261 {
3262 (data_id["tract"], data_id["patch"])
3263 for data_id in registry.queryDataIds(
3264 ["patch"],
3265 dataId={skypix_dimension.name: skypix_id},
3266 )
3267 },
3268 overlapping_patches,
3269 )
3270 # Test that a three-way join that includes the common skypix system in
3271 # the dimensions doesn't generate redundant join terms in the query.
3272 full_data_ids = set(
3273 registry.queryDataIds(
3274 ["tract", "visit", "htm7"], skymap="hsc_rings_v1", instrument="HSC"
3275 ).expanded()
3276 )
3277 self.assertGreater(len(full_data_ids), 0)
3278 for data_id in full_data_ids:
3279 self.assertFalse(data_id.records["tract"].region.isDisjointFrom(data_id.records["htm7"].region))
3280 self.assertFalse(data_id.records["visit"].region.isDisjointFrom(data_id.records["htm7"].region))
3282 def test_spatial_constraint_queries(self) -> None:
3283 """Test queries in which one spatial dimension in the constraint (data
3284 ID or ``where`` string) constrains a different spatial dimension in the
3285 query result columns.
3286 """
3287 registry = self.makeRegistry()
3288 self.loadData(registry, "hsc-rc2-subset.yaml")
3289 patch_regions = {
3290 (data_id["tract"], data_id["patch"]): data_id.region
3291 for data_id in registry.queryDataIds(["patch"]).expanded()
3292 }
3293 observation_regions = {
3294 (data_id["visit"], data_id["detector"]): data_id.region
3295 for data_id in registry.queryDataIds(["visit", "detector"]).expanded()
3296 }
3297 all_combos = {
3298 (patch_key, observation_key)
3299 for patch_key, observation_key in itertools.product(patch_regions, observation_regions)
3300 }
3301 overlapping_combos = {
3302 (patch_key, observation_key)
3303 for patch_key, observation_key in all_combos
3304 if not patch_regions[patch_key].isDisjointFrom(observation_regions[observation_key])
3305 }
3306 # Check a direct spatial join with no constraint first.
3307 self.assertEqual(
3308 {
3309 ((data_id["tract"], data_id["patch"]), (data_id["visit"], data_id["detector"]))
3310 for data_id in registry.queryDataIds(["patch", "visit", "detector"])
3311 },
3312 overlapping_combos,
3313 )
3314 overlaps_by_patch: defaultdict[tuple[int, int], set[tuple[int, int]]] = defaultdict(set)
3315 overlaps_by_observation: defaultdict[tuple[int, int], set[tuple[int, int]]] = defaultdict(set)
3316 for patch_key, observation_key in overlapping_combos:
3317 overlaps_by_patch[patch_key].add(observation_key)
3318 overlaps_by_observation[observation_key].add(patch_key)
3319 # Find patches and observations that each overlap at least one, but not
3320 # all, of the items in the other set.
3321 nontrivial_patch = next(
3322 iter(
3323 patch_key
3324 for patch_key, observation_keys in overlaps_by_patch.items()
3325 if observation_keys and observation_keys != observation_regions.keys()
3326 )
3327 )
3328 nontrivial_observation = next(
3329 iter(
3330 observation_key
3331 for observation_key, patch_keys in overlaps_by_observation.items()
3332 if patch_keys and patch_keys != patch_regions.keys()
3333 )
3334 )
3335 # Use the nontrivial patches and observations as constraints on the
3336 # other dimensions in various ways, first via a 'where' expression.
3337 # It's better in general to use 'bind' instead of f-strings, but these
3338 # are all integers so there are no quoting concerns.
3339 self.assertEqual(
3340 {
3341 (data_id["visit"], data_id["detector"])
3342 for data_id in registry.queryDataIds(
3343 ["visit", "detector"],
3344 where=f"tract={nontrivial_patch[0]} AND patch={nontrivial_patch[1]}",
3345 skymap="hsc_rings_v1",
3346 )
3347 },
3348 overlaps_by_patch[nontrivial_patch],
3349 )
3350 self.assertEqual(
3351 {
3352 (data_id["tract"], data_id["patch"])
3353 for data_id in registry.queryDataIds(
3354 ["patch"],
3355 where=f"visit={nontrivial_observation[0]} AND detector={nontrivial_observation[1]}",
3356 instrument="HSC",
3357 )
3358 },
3359 overlaps_by_observation[nontrivial_observation],
3360 )
3361 # and then via the dataId argument.
3362 self.assertEqual(
3363 {
3364 (data_id["visit"], data_id["detector"])
3365 for data_id in registry.queryDataIds(
3366 ["visit", "detector"],
3367 dataId={
3368 "tract": nontrivial_patch[0],
3369 "patch": nontrivial_patch[1],
3370 },
3371 skymap="hsc_rings_v1",
3372 )
3373 },
3374 overlaps_by_patch[nontrivial_patch],
3375 )
3376 self.assertEqual(
3377 {
3378 (data_id["tract"], data_id["patch"])
3379 for data_id in registry.queryDataIds(
3380 ["patch"],
3381 dataId={
3382 "visit": nontrivial_observation[0],
3383 "detector": nontrivial_observation[1],
3384 },
3385 instrument="HSC",
3386 )
3387 },
3388 overlaps_by_observation[nontrivial_observation],
3389 )
3391 def test_query_projection_drop_postprocessing(self) -> None:
3392 """Test that projections and deduplications on query objects can
3393 drop post-query region filtering to ensure the query remains in
3394 the SQL engine.
3395 """
3396 registry = self.makeRegistry()
3397 self.loadData(registry, "base.yaml")
3398 self.loadData(registry, "spatial.yaml")
3400 def pop_transfer(tree: Relation) -> Relation:
3401 """If a relation tree terminates with a transfer to a new engine,
3402 return the relation prior to that transfer. If not, return the
3403 original relation.
3404 """
3405 match tree:
3406 case Transfer(target=target):
3407 return target
3408 case _:
3409 return tree
3411 # There's no public way to get a Query object yet, so we get one from a
3412 # DataCoordinateQueryResults private attribute. When a public API is
3413 # available this test should use it.
3414 query = registry.queryDataIds(["visit", "detector", "tract", "patch"])._query
3415 # We expect this query to terminate in the iteration engine originally,
3416 # because region-filtering is necessary.
3417 self.assertIsInstance(pop_transfer(query.relation).engine, iteration.Engine)
3418 # If we deduplicate, we usually have to do that downstream of the
3419 # filtering. That means the deduplication has to happen in the
3420 # iteration engine.
3421 self.assertIsInstance(pop_transfer(query.projected(unique=True).relation).engine, iteration.Engine)
3422 # If we pass drop_postprocessing, we instead drop the region filtering
3423 # so the deduplication can happen in SQL (though there might still be a
3424 # transfer to the iteration engine at the tail of the tree, which we can
3425 # ignore; that's what pop_transfer takes care of here).
3426 self.assertIsInstance(
3427 pop_transfer(query.projected(unique=True, drop_postprocessing=True).relation).engine,
3428 sql.Engine,
3429 )
3431 def test_query_find_datasets_drop_postprocessing(self) -> None:
3432 """Test that DataCoordinateQueryResults.findDatasets avoids commutator
3433 problems with the FindFirstDataset relation operation.
3434 """
3435 # Setup: load some visit, tract, and patch records, and insert two
3436 # datasets with dimensions {visit, patch}, with one in each of two
3437 # RUN collections.
3438 registry = self.makeRegistry()
3439 self.loadData(registry, "base.yaml")
3440 self.loadData(registry, "spatial.yaml")
3441 storage_class = StorageClass("Warpy")
3442 registry.storageClasses.registerStorageClass(storage_class)
3443 dataset_type = DatasetType(
3444 "warp", {"visit", "patch"}, storageClass=storage_class, universe=registry.dimensions
3445 )
3446 registry.registerDatasetType(dataset_type)
3447 (data_id,) = registry.queryDataIds(["visit", "patch"]).limit(1)
3448 registry.registerRun("run1")
3449 registry.registerRun("run2")
3450 (ref1,) = registry.insertDatasets(dataset_type, [data_id], run="run1")
3451 (ref2,) = registry.insertDatasets(dataset_type, [data_id], run="run2")
3452 # Query for the dataset using queryDataIds(...).findDatasets(...)
3453 # against only one of the two collections. This should work even
3454 # though the relation returned by queryDataIds ends with
3455 # iteration-engine region-filtering, because we can recognize before
3456 # running the query that there is only one collection to search and
3457 # hence the (default) findFirst=True is irrelevant, and joining in the
3458 # dataset query commutes past the iteration-engine postprocessing.
3459 query1 = registry.queryDataIds(
3460 {"visit", "patch"}, visit=data_id["visit"], instrument=data_id["instrument"]
3461 )
3462 self.assertEqual(
3463 set(query1.findDatasets(dataset_type.name, collections=["run1"])),
3464 {ref1},
3465 )
3466 # Query for the dataset using queryDataIds(...).findDatasets(...)
3467 # against both collections. This can only work if the FindFirstDataset
3468 # operation can be commuted past the iteration-engine operations into SQL.
3469 query2 = registry.queryDataIds(
3470 {"visit", "patch"}, visit=data_id["visit"], instrument=data_id["instrument"]
3471 )
3472 self.assertEqual(
3473 set(query2.findDatasets(dataset_type.name, collections=["run2", "run1"])),
3474 {ref2},
3475 )
3477 def test_query_empty_collections(self) -> None:
3478 """Test for registry query methods with empty collections. The methods
3479 should return empty result set (or None when applicable) and provide
3480 "doomed" diagnostics.
3481 """
3482 registry = self.makeRegistry()
3483 self.loadData(registry, "base.yaml")
3484 self.loadData(registry, "datasets.yaml")
3486 # Tests for registry.findDataset()
3487 with self.assertRaises(NoDefaultCollectionError):
3488 registry.findDataset("bias", instrument="Cam1", detector=1)
3489 self.assertIsNotNone(registry.findDataset("bias", instrument="Cam1", detector=1, collections=...))
3490 self.assertIsNone(registry.findDataset("bias", instrument="Cam1", detector=1, collections=[]))
3492 # Tests for registry.queryDatasets()
3493 with self.assertRaises(NoDefaultCollectionError):
3494 registry.queryDatasets("bias")
3495 self.assertTrue(list(registry.queryDatasets("bias", collections=...)))
3497 result = registry.queryDatasets("bias", collections=[])
3498 self.assertEqual(len(list(result)), 0)
3499 messages = list(result.explain_no_results())
3500 self.assertTrue(messages)
3501 self.assertTrue(any("because collection list is empty" in message for message in messages))
3503 # Tests for registry.queryDataIds()
3504 with self.assertRaises(NoDefaultCollectionError):
3505 registry.queryDataIds("detector", datasets="bias")
3506 self.assertTrue(list(registry.queryDataIds("detector", datasets="bias", collections=...)))
3508 result = registry.queryDataIds("detector", datasets="bias", collections=[])
3509 self.assertEqual(len(list(result)), 0)
3510 messages = list(result.explain_no_results())
3511 self.assertTrue(messages)
3512 self.assertTrue(any("because collection list is empty" in message for message in messages))
3514 # Tests for registry.queryDimensionRecords()
3515 with self.assertRaises(NoDefaultCollectionError):
3516 registry.queryDimensionRecords("detector", datasets="bias")
3517 self.assertTrue(list(registry.queryDimensionRecords("detector", datasets="bias", collections=...)))
3519 result = registry.queryDimensionRecords("detector", datasets="bias", collections=[])
3520 self.assertEqual(len(list(result)), 0)
3521 messages = list(result.explain_no_results())
3522 self.assertTrue(messages)
3523 self.assertTrue(any("because collection list is empty" in message for message in messages))