Coverage for python/lsst/daf/butler/registry/tests/_registry.py: 4%
1466 statements
coverage.py v7.2.7, created at 2023-06-15 09:13 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RegistryTests"]

import itertools
import logging
import os
import re
import unittest
import uuid
from abc import ABC, abstractmethod
from collections import defaultdict, namedtuple
from collections.abc import Iterator
from datetime import datetime, timedelta
from typing import TYPE_CHECKING

import astropy.time
import sqlalchemy

try:
    import numpy as np
except ImportError:
    np = None

import lsst.sphgeom
from lsst.daf.relation import Relation, RelationalAlgebraError, Transfer, iteration, sql

from ...core import (
    DataCoordinate,
    DataCoordinateSet,
    DatasetAssociation,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    NamedValueSet,
    SkyPixDimension,
    StorageClass,
    Timespan,
    ddl,
)
from .._collection_summary import CollectionSummary
from .._collectionType import CollectionType
from .._config import RegistryConfig
from .._exceptions import (
    ArgumentError,
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    DatasetTypeError,
    InconsistentDataIdError,
    MissingCollectionError,
    MissingDatasetTypeError,
    NoDefaultCollectionError,
    OrphanedRecordError,
)
from ..interfaces import ButlerAttributeExistsError

if TYPE_CHECKING:
    from .._registry import Registry


class RegistryTests(ABC):
    """Generic tests for the `Registry` class that can be subclassed to
    generate tests for different configurations.
    """

    collectionsManager: str | None = None
    """Name of the collections manager class. If a subclass provides a value
    for this member, it overrides the name specified in the default
    configuration (`str`).
    """

    datasetsManager: str | dict[str, str] | None = None
    """Name or configuration dictionary of the datasets manager class. If a
    subclass provides a value for this member, it overrides the name
    specified in the default configuration (`str` or `dict`).
    """

    @classmethod
    @abstractmethod
    def getDataDir(cls) -> str:
        """Return the root directory containing test data YAML files."""
        raise NotImplementedError()

    def makeRegistryConfig(self) -> RegistryConfig:
        """Create the RegistryConfig used to create a registry.

        This method should be called by a subclass from `makeRegistry`.
        The returned instance will be pre-configured based on the values of
        class members, and default-configured for all other parameters.
        Subclasses that need default configuration should just instantiate
        `RegistryConfig` directly.
        """
        config = RegistryConfig()
        if self.collectionsManager:
            config["managers", "collections"] = self.collectionsManager
        if self.datasetsManager:
            config["managers", "datasets"] = self.datasetsManager
        return config
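
    # A minimal sketch of how a concrete subclass might override the manager
    # members above (the manager path and class names are illustrative
    # assumptions, not part of this test suite):
    #
    #     class MyRegistryTests(RegistryTests, unittest.TestCase):
    #         collectionsManager = "lsst.daf.butler.registry.collections.synthIntKey.SynthIntKeyCollectionManager"
    #
    #         @classmethod
    #         def getDataDir(cls) -> str:
    #             return os.path.join(os.path.dirname(__file__), "data", "registry")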

    @abstractmethod
    def makeRegistry(self, share_repo_with: Registry | None = None) -> Registry | None:
        """Return the Registry instance to be tested.

        Parameters
        ----------
        share_repo_with : `Registry`, optional
            If provided, the new registry should point to the same data
            repository as this existing registry.

        Returns
        -------
        registry : `Registry`
            New `Registry` instance, or `None` *only* if `share_repo_with` is
            not `None` and this test case does not support that argument
            (e.g. it is impossible with in-memory SQLite DBs).
        """
        raise NotImplementedError()

    def loadData(self, registry: Registry, filename: str):
        """Load registry test data from ``getDataDir/<filename>``,
        which should be a YAML import/export file.
        """
        from ...transfers import YamlRepoImportBackend

        with open(os.path.join(self.getDataDir(), filename)) as stream:
            backend = YamlRepoImportBackend(stream, registry)
            backend.register()
            backend.load(datastore=None)

    def checkQueryResults(self, results, expected):
        """Check that a query results object contains expected values.

        Parameters
        ----------
        results : `DataCoordinateQueryResults` or `DatasetQueryResults`
            A lazy-evaluation query results object.
        expected : `list`
            A list of `DataCoordinate` or `DatasetRef` objects that should be
            equal to the results of the query, aside from ordering.
        """
        self.assertCountEqual(list(results), expected)
        self.assertEqual(results.count(), len(expected))
        if expected:
            self.assertTrue(results.any())
        else:
            self.assertFalse(results.any())
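
    # Usage sketch for the helper above (hypothetical query against a
    # populated registry; the expected list is elided):
    #
    #     results = registry.queryDataIds(["detector"], instrument="Cam1")
    #     self.checkQueryResults(
    #         results,
    #         expected=[...],  # DataCoordinate objects the query should yield
    #     )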

    def testOpaque(self):
        """Tests for `Registry.registerOpaqueTable`,
        `Registry.insertOpaqueData`, `Registry.fetchOpaqueData`, and
        `Registry.deleteOpaqueData`.
        """
        registry = self.makeRegistry()
        table = "opaque_table_for_testing"
        registry.registerOpaqueTable(
            table,
            spec=ddl.TableSpec(
                fields=[
                    ddl.FieldSpec("id", dtype=sqlalchemy.BigInteger, primaryKey=True),
                    ddl.FieldSpec("name", dtype=sqlalchemy.String, length=16, nullable=False),
                    ddl.FieldSpec("count", dtype=sqlalchemy.SmallInteger, nullable=True),
                ],
            ),
        )
        rows = [
            {"id": 1, "name": "one", "count": None},
            {"id": 2, "name": "two", "count": 5},
            {"id": 3, "name": "three", "count": 6},
        ]
        registry.insertOpaqueData(table, *rows)
        self.assertCountEqual(rows, list(registry.fetchOpaqueData(table)))
        self.assertEqual(rows[0:1], list(registry.fetchOpaqueData(table, id=1)))
        self.assertEqual(rows[1:2], list(registry.fetchOpaqueData(table, name="two")))
        self.assertEqual(rows[0:1], list(registry.fetchOpaqueData(table, id=(1, 3), name=("one", "two"))))
        self.assertEqual(rows, list(registry.fetchOpaqueData(table, id=(1, 2, 3))))
        # Test very long IN clause which exceeds sqlite limit on number of
        # parameters. SQLite says the limit is 32k but it looks like it is
        # much higher.
        self.assertEqual(rows, list(registry.fetchOpaqueData(table, id=list(range(300_000)))))
        # Two IN clauses, each longer than 1k batch size, first with
        # duplicates, second has matching elements in different batches (after
        # sorting).
        self.assertEqual(
            rows[0:2],
            list(
                registry.fetchOpaqueData(
                    table,
                    id=list(range(1000)) + list(range(100, 0, -1)),
                    name=["one"] + [f"q{i}" for i in range(2200)] + ["two"],
                )
            ),
        )
        self.assertEqual([], list(registry.fetchOpaqueData(table, id=1, name="two")))
        registry.deleteOpaqueData(table, id=3)
        self.assertCountEqual(rows[:2], list(registry.fetchOpaqueData(table)))
        registry.deleteOpaqueData(table)
        self.assertEqual([], list(registry.fetchOpaqueData(table)))
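
    # The opaque-table API exercised above, in brief (a sketch only; the
    # table name and fields are arbitrary):
    #
    #     registry.registerOpaqueTable("my_table", spec=ddl.TableSpec(fields=[...]))
    #     registry.insertOpaqueData("my_table", {"id": 1, "name": "one"})
    #     rows = list(registry.fetchOpaqueData("my_table", id=1))  # kwargs filter rows
    #     registry.deleteOpaqueData("my_table", id=1)  # as tested above, no kwargs deletes all rows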

    def testDatasetType(self):
        """Tests for `Registry.registerDatasetType` and
        `Registry.getDatasetType`.
        """
        registry = self.makeRegistry()
        # Check valid insert
        datasetTypeName = "test"
        storageClass = StorageClass("testDatasetType")
        registry.storageClasses.registerStorageClass(storageClass)
        dimensions = registry.dimensions.extract(("instrument", "visit"))
        differentDimensions = registry.dimensions.extract(("instrument", "patch"))
        inDatasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        # Inserting for the first time should return True
        self.assertTrue(registry.registerDatasetType(inDatasetType))
        outDatasetType1 = registry.getDatasetType(datasetTypeName)
        self.assertEqual(outDatasetType1, inDatasetType)

        # Re-inserting should work
        self.assertFalse(registry.registerDatasetType(inDatasetType))
        # Except when they are not identical
        with self.assertRaises(ConflictingDefinitionError):
            nonIdenticalDatasetType = DatasetType(datasetTypeName, differentDimensions, storageClass)
            registry.registerDatasetType(nonIdenticalDatasetType)

        # Template can be None
        datasetTypeName = "testNoneTemplate"
        storageClass = StorageClass("testDatasetType2")
        registry.storageClasses.registerStorageClass(storageClass)
        dimensions = registry.dimensions.extract(("instrument", "visit"))
        inDatasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(inDatasetType)
        outDatasetType2 = registry.getDatasetType(datasetTypeName)
        self.assertEqual(outDatasetType2, inDatasetType)

        allTypes = set(registry.queryDatasetTypes())
        self.assertEqual(allTypes, {outDatasetType1, outDatasetType2})

    def testDimensions(self):
        """Tests for `Registry.insertDimensionData`,
        `Registry.syncDimensionData`, and `Registry.expandDataId`.
        """
        registry = self.makeRegistry()
        dimensionName = "instrument"
        dimension = registry.dimensions[dimensionName]
        dimensionValue = {
            "name": "DummyCam",
            "visit_max": 10,
            "visit_system": 0,
            "exposure_max": 10,
            "detector_max": 2,
            "class_name": "lsst.pipe.base.Instrument",
        }
        registry.insertDimensionData(dimensionName, dimensionValue)
        # Inserting the same value twice should fail
        with self.assertRaises(sqlalchemy.exc.IntegrityError):
            registry.insertDimensionData(dimensionName, dimensionValue)
        # expandDataId should retrieve the record we just inserted
        self.assertEqual(
            registry.expandDataId(instrument="DummyCam", graph=dimension.graph)
            .records[dimensionName]
            .toDict(),
            dimensionValue,
        )
        # expandDataId should raise if there is no record with the given ID.
        with self.assertRaises(DataIdValueError):
            registry.expandDataId({"instrument": "Unknown"}, graph=dimension.graph)
        # band doesn't have a table; insert should fail.
        with self.assertRaises(TypeError):
            registry.insertDimensionData("band", {"band": "i"})
        dimensionName2 = "physical_filter"
        dimension2 = registry.dimensions[dimensionName2]
        dimensionValue2 = {"name": "DummyCam_i", "band": "i"}
        # Missing required dependency ("instrument") should fail
        with self.assertRaises(KeyError):
            registry.insertDimensionData(dimensionName2, dimensionValue2)
        # Adding the required dependency should fix the failure
        dimensionValue2["instrument"] = "DummyCam"
        registry.insertDimensionData(dimensionName2, dimensionValue2)
        # expandDataId should retrieve the record we just inserted.
        self.assertEqual(
            registry.expandDataId(instrument="DummyCam", physical_filter="DummyCam_i", graph=dimension2.graph)
            .records[dimensionName2]
            .toDict(),
            dimensionValue2,
        )
        # Use syncDimensionData to insert a new record successfully.
        dimensionName3 = "detector"
        dimensionValue3 = {
            "instrument": "DummyCam",
            "id": 1,
            "full_name": "one",
            "name_in_raft": "zero",
            "purpose": "SCIENCE",
        }
        self.assertTrue(registry.syncDimensionData(dimensionName3, dimensionValue3))
        # Sync that again. Note that one field ("raft") is NULL, and that
        # should be okay.
        self.assertFalse(registry.syncDimensionData(dimensionName3, dimensionValue3))
        # Now try that sync with the same primary key but a different value.
        # This should fail.
        with self.assertRaises(ConflictingDefinitionError):
            registry.syncDimensionData(
                dimensionName3,
                {
                    "instrument": "DummyCam",
                    "id": 1,
                    "full_name": "one",
                    "name_in_raft": "four",
                    "purpose": "SCIENCE",
                },
            )
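
    # In short (a sketch, using "DummyCam" as above): insertDimensionData
    # always inserts and raises on conflict, while syncDimensionData is
    # insert-if-absent and reports whether it changed anything:
    #
    #     registry.insertDimensionData("instrument", {"name": "DummyCam"})
    #     inserted = registry.syncDimensionData("instrument", {"name": "DummyCam"})
    #     assert not inserted  # record already present and identical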

    @unittest.skipIf(np is None, "numpy not available.")
    def testNumpyDataId(self):
        """Test that we can use a numpy int in a dataId."""
        registry = self.makeRegistry()
        dimensionEntries = [
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            # Using an np.int64 here fails unless Records.fromDict is also
            # patched to look for numbers.Integral
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        for args in dimensionEntries:
            registry.insertDimensionData(*args)

        # Try a normal integer and something that looks like an int but
        # is not.
        for visit_id in (42, np.int64(42)):
            with self.subTest(visit_id=visit_id, id_type=type(visit_id).__name__):
                expanded = registry.expandDataId({"instrument": "DummyCam", "visit": visit_id})
                self.assertEqual(expanded["visit"], int(visit_id))
                self.assertIsInstance(expanded["visit"], int)

    def testDataIdRelationships(self):
        """Test that `Registry.expandDataId` raises an exception when the
        given keys are inconsistent.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        # Insert a few more dimension records for the next test.
        registry.insertDimensionData(
            "exposure",
            {"instrument": "Cam1", "id": 1, "obs_id": "one", "physical_filter": "Cam1-G"},
        )
        registry.insertDimensionData(
            "exposure",
            {"instrument": "Cam1", "id": 2, "obs_id": "two", "physical_filter": "Cam1-G"},
        )
        registry.insertDimensionData(
            "visit_system",
            {"instrument": "Cam1", "id": 0, "name": "one-to-one"},
        )
        registry.insertDimensionData(
            "visit",
            {"instrument": "Cam1", "id": 1, "name": "one", "physical_filter": "Cam1-G", "visit_system": 0},
        )
        registry.insertDimensionData(
            "visit_definition",
            {"instrument": "Cam1", "visit": 1, "exposure": 1, "visit_system": 0},
        )
        with self.assertRaises(InconsistentDataIdError):
            registry.expandDataId(
                {"instrument": "Cam1", "visit": 1, "exposure": 2},
            )

    def testDataset(self):
        """Basic tests for `Registry.insertDatasets`, `Registry.getDataset`,
        and `Registry.removeDatasets`.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        run = "tésτ"
        registry.registerRun(run)
        datasetType = registry.getDatasetType("bias")
        dataId = {"instrument": "Cam1", "detector": 2}
        (ref,) = registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
        outRef = registry.getDataset(ref.id)
        self.assertIsNotNone(ref.id)
        self.assertEqual(ref, outRef)
        with self.assertRaises(ConflictingDefinitionError):
            registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
        registry.removeDatasets([ref])
        self.assertIsNone(registry.findDataset(datasetType, dataId, collections=[run]))

    def testFindDataset(self):
        """Tests for `Registry.findDataset`."""
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        run = "tésτ"
        datasetType = registry.getDatasetType("bias")
        dataId = {"instrument": "Cam1", "detector": 4}
        registry.registerRun(run)
        (inputRef,) = registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
        outputRef = registry.findDataset(datasetType, dataId, collections=[run])
        self.assertEqual(outputRef, inputRef)
        # Check that retrieval with invalid dataId raises
        with self.assertRaises(LookupError):
            dataId = {"instrument": "Cam1"}  # no detector
            registry.findDataset(datasetType, dataId, collections=run)
        # Check that different dataIds match to different datasets
        dataId1 = {"instrument": "Cam1", "detector": 1}
        (inputRef1,) = registry.insertDatasets(datasetType, dataIds=[dataId1], run=run)
        dataId2 = {"instrument": "Cam1", "detector": 2}
        (inputRef2,) = registry.insertDatasets(datasetType, dataIds=[dataId2], run=run)
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=run), inputRef1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=run), inputRef2)
        self.assertNotEqual(registry.findDataset(datasetType, dataId1, collections=run), inputRef2)
        self.assertNotEqual(registry.findDataset(datasetType, dataId2, collections=run), inputRef1)
        # Check that requesting a non-existing dataId returns None
        nonExistingDataId = {"instrument": "Cam1", "detector": 3}
        self.assertIsNone(registry.findDataset(datasetType, nonExistingDataId, collections=run))
        # Search more than one collection, in which two have the right
        # dataset type and another does not.
        registry.registerRun("empty")
        self.loadData(registry, "datasets-uuid.yaml")
        bias1 = registry.findDataset("bias", instrument="Cam1", detector=2, collections=["imported_g"])
        self.assertIsNotNone(bias1)
        bias2 = registry.findDataset("bias", instrument="Cam1", detector=2, collections=["imported_r"])
        self.assertIsNotNone(bias2)
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "imported_g", "imported_r"]
            ),
        )
        self.assertEqual(
            bias2,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "imported_r", "imported_g"]
            ),
        )
        # Search more than one collection, with one of them a CALIBRATION
        # collection.
        registry.registerCollection("Cam1/calib", CollectionType.CALIBRATION)
        timespan = Timespan(
            begin=astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai"),
            end=astropy.time.Time("2020-01-01T02:00:00", format="isot", scale="tai"),
        )
        registry.certify("Cam1/calib", [bias2], timespan=timespan)
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias",
                instrument="Cam1",
                detector=2,
                collections=["empty", "imported_g", "Cam1/calib"],
                timespan=timespan,
            ),
        )
        self.assertEqual(
            bias2,
            registry.findDataset(
                "bias",
                instrument="Cam1",
                detector=2,
                collections=["empty", "Cam1/calib", "imported_g"],
                timespan=timespan,
            ),
        )
        # If we try to search those same collections without a timespan, it
        # should still work, since the CALIBRATION collection is ignored.
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "imported_g", "Cam1/calib"]
            ),
        )
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "Cam1/calib", "imported_g"]
            ),
        )
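
    # Calibration lookups in a sketch (names as in the test above): datasets
    # certified into a CALIBRATION collection are only matched when a
    # timespan is given, so the same collection search can behave differently
    # with and without one:
    #
    #     ref = registry.findDataset(
    #         "bias",
    #         instrument="Cam1",
    #         detector=2,
    #         collections=["Cam1/calib"],
    #         timespan=timespan,  # required to match certified calibrations
    #     )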

    def testRemoveDatasetTypeSuccess(self):
        """Test that Registry.removeDatasetType works when there are no
        datasets of that type present.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        registry.removeDatasetType("flat")
        with self.assertRaises(MissingDatasetTypeError):
            registry.getDatasetType("flat")

    def testRemoveDatasetTypeFailure(self):
        """Test that Registry.removeDatasetType raises when there are datasets
        of that type present or if the dataset type is for a component.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        with self.assertRaises(OrphanedRecordError):
            registry.removeDatasetType("flat")
        with self.assertRaises(ValueError):
            registry.removeDatasetType(DatasetType.nameWithComponent("flat", "image"))

    def testImportDatasetsUUID(self):
        """Test for `Registry._importDatasets` with UUID dataset ID."""
        if isinstance(self.datasetsManager, str):
            if not self.datasetsManager.endswith(".ByDimensionsDatasetRecordStorageManagerUUID"):
                self.skipTest(f"Unexpected dataset manager {self.datasetsManager}")
        elif isinstance(self.datasetsManager, dict):
            if not self.datasetsManager["cls"].endswith(".ByDimensionsDatasetRecordStorageManagerUUID"):
                self.skipTest(f"Unexpected dataset manager {self.datasetsManager['cls']}")

        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        for run in range(6):
            registry.registerRun(f"run{run}")
        datasetTypeBias = registry.getDatasetType("bias")
        datasetTypeFlat = registry.getDatasetType("flat")
        dataIdBias1 = {"instrument": "Cam1", "detector": 1}
        dataIdBias2 = {"instrument": "Cam1", "detector": 2}
        dataIdFlat1 = {"instrument": "Cam1", "detector": 1, "physical_filter": "Cam1-G", "band": "g"}

        ref = DatasetRef(datasetTypeBias, dataIdBias1, run="run0")
        (ref1,) = registry._importDatasets([ref])
        # UUID is used without change
        self.assertEqual(ref.id, ref1.id)

        # All different failure modes
        refs = (
            # Importing same DatasetRef with different dataset ID is an error
            DatasetRef(datasetTypeBias, dataIdBias1, run="run0"),
            # Same DatasetId but different DataId
            DatasetRef(datasetTypeBias, dataIdBias2, id=ref1.id, run="run0"),
            DatasetRef(datasetTypeFlat, dataIdFlat1, id=ref1.id, run="run0"),
            # Same DatasetRef and DatasetId but different run
            DatasetRef(datasetTypeBias, dataIdBias1, id=ref1.id, run="run1"),
        )
        for ref in refs:
            with self.assertRaises(ConflictingDefinitionError):
                registry._importDatasets([ref])

        # Test for non-unique IDs, they can be re-imported multiple times.
        for run, idGenMode in ((2, DatasetIdGenEnum.DATAID_TYPE), (4, DatasetIdGenEnum.DATAID_TYPE_RUN)):
            with self.subTest(idGenMode=idGenMode):
                # Make dataset ref with reproducible dataset ID.
                ref = DatasetRef(datasetTypeBias, dataIdBias1, run=f"run{run}", id_generation_mode=idGenMode)
                (ref1,) = registry._importDatasets([ref])
                self.assertIsInstance(ref1.id, uuid.UUID)
                self.assertEqual(ref1.id.version, 5)
                self.assertEqual(ref1.id, ref.id)

                # Importing it again is OK
                (ref2,) = registry._importDatasets([ref1])
                self.assertEqual(ref2.id, ref1.id)

                # Cannot import to different run with the same ID
                ref = DatasetRef(datasetTypeBias, dataIdBias1, id=ref1.id, run=f"run{run+1}")
                with self.assertRaises(ConflictingDefinitionError):
                    registry._importDatasets([ref])

                ref = DatasetRef(
                    datasetTypeBias, dataIdBias1, run=f"run{run+1}", id_generation_mode=idGenMode
                )
                if idGenMode is DatasetIdGenEnum.DATAID_TYPE:
                    # Cannot import same DATAID_TYPE ref into a new run
                    with self.assertRaises(ConflictingDefinitionError):
                        (ref2,) = registry._importDatasets([ref])
                else:
                    # DATAID_TYPE_RUN ref can be imported into a new run
                    (ref2,) = registry._importDatasets([ref])

    def testDatasetTypeComponentQueries(self):
        """Test component options when querying for dataset types.

        All of the behavior here is deprecated, so many of these tests are
        currently wrapped in a context to check that we get a warning whenever
        a component dataset is actually returned.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        # Test querying for dataset types with different inputs.
        # First query for all dataset types; components should only be
        # included when components=True.
        self.assertEqual({"bias", "flat"}, NamedValueSet(registry.queryDatasetTypes()).names)
        self.assertEqual({"bias", "flat"}, NamedValueSet(registry.queryDatasetTypes(components=False)).names)
        with self.assertWarns(FutureWarning):
            self.assertLess(
                {"bias", "flat", "bias.wcs", "flat.photoCalib"},
                NamedValueSet(registry.queryDatasetTypes(components=True)).names,
            )
        # Use a pattern that can match either parent or components. Again,
        # components are only returned if components=True.
        self.assertEqual({"bias"}, NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"))).names)
        self.assertEqual(
            {"bias"}, NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"), components=False)).names
        )
        with self.assertWarns(FutureWarning):
            self.assertLess(
                {"bias", "bias.wcs"},
                NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"), components=True)).names,
            )
        # This pattern matches only a component. In this case we also return
        # that component dataset type if components=None.
        with self.assertWarns(FutureWarning):
            self.assertEqual(
                {"bias.wcs"}, NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"))).names
            )
        self.assertEqual(
            set(),
            NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"), components=False)).names,
        )
        with self.assertWarns(FutureWarning):
            self.assertEqual(
                {"bias.wcs"},
                NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"), components=True)).names,
            )
        # Add a dataset type using a StorageClass that we'll then remove; check
        # that this does not affect our ability to query for dataset types
        # (though it will warn).
        tempStorageClass = StorageClass(
            name="TempStorageClass",
            components={
                "data1": registry.storageClasses.getStorageClass("StructuredDataDict"),
                "data2": registry.storageClasses.getStorageClass("StructuredDataDict"),
            },
        )
        registry.storageClasses.registerStorageClass(tempStorageClass)
        datasetType = DatasetType(
            "temporary",
            dimensions=["instrument"],
            storageClass=tempStorageClass,
            universe=registry.dimensions,
        )
        registry.registerDatasetType(datasetType)
        registry.storageClasses._unregisterStorageClass(tempStorageClass.name)
        datasetType._storageClass = None
        del tempStorageClass
        # Querying for all dataset types, including components, should include
        # at least all non-component dataset types (and I don't want to
        # enumerate all of the Exposure components for bias and flat here).
        with self.assertWarns(FutureWarning):
            with self.assertLogs("lsst.daf.butler.registry", logging.WARN) as cm:
                everything = NamedValueSet(registry.queryDatasetTypes(components=True))
        self.assertIn("TempStorageClass", cm.output[0])
        self.assertLess({"bias", "flat", "temporary"}, everything.names)
        # It should not include "temporary.data", because we tried to remove
        # the storage class that would tell it about that. So if the next line
        # fails (i.e. "temporary.data" _is_ in everything.names), it means
        # this part of the test isn't doing anything, because the _unregister
        # call above isn't simulating the real-life case we want it to
        # simulate, in which different versions of daf_butler in entirely
        # different Python processes interact with the same repo.
        self.assertNotIn("temporary.data", everything.names)
        # Query for dataset types that start with "temp". This should again
        # not include the component, and also not fail.
        with self.assertLogs("lsst.daf.butler.registry", logging.WARN) as cm:
            startsWithTemp = NamedValueSet(registry.queryDatasetTypes(re.compile("temp.*"), components=True))
        self.assertIn("TempStorageClass", cm.output[0])
        self.assertEqual({"temporary"}, startsWithTemp.names)
        # Querying with no components should not warn at all.
        with self.assertLogs("lsst.daf.butler.registries", logging.WARN) as cm:
            startsWithTemp = NamedValueSet(registry.queryDatasetTypes(re.compile("temp.*"), components=False))
            # Must issue a warning of our own to be captured.
            logging.getLogger("lsst.daf.butler.registries").warning("test message")
        self.assertEqual(len(cm.output), 1)
        self.assertIn("test message", cm.output[0])

    def testComponentLookups(self):
        """Test searching for component datasets via their parents.

        All of the behavior here is deprecated, so many of these tests are
        currently wrapped in a context to check that we get a warning whenever
        a component dataset is actually returned.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        # Test getting the child dataset type (which does still exist in the
        # Registry), and check for consistency with
        # DatasetRef.makeComponentRef.
        collection = "imported_g"
        parentType = registry.getDatasetType("bias")
        childType = registry.getDatasetType("bias.wcs")
        parentRefResolved = registry.findDataset(
            parentType, collections=collection, instrument="Cam1", detector=1
        )
        self.assertIsInstance(parentRefResolved, DatasetRef)
        self.assertEqual(childType, parentRefResolved.makeComponentRef("wcs").datasetType)
        # Search for a single dataset with findDataset.
        childRef1 = registry.findDataset("bias.wcs", collections=collection, dataId=parentRefResolved.dataId)
        self.assertEqual(childRef1, parentRefResolved.makeComponentRef("wcs"))
        # Search for detector data IDs constrained by component dataset
        # existence with queryDataIds.
        with self.assertWarns(FutureWarning):
            dataIds = registry.queryDataIds(
                ["detector"],
                datasets=["bias.wcs"],
                collections=collection,
            ).toSet()
        self.assertEqual(
            dataIds,
            DataCoordinateSet(
                {
                    DataCoordinate.standardize(instrument="Cam1", detector=d, graph=parentType.dimensions)
                    for d in (1, 2, 3)
                },
                parentType.dimensions,
            ),
        )
        # Search for multiple datasets of a single type with queryDatasets.
        with self.assertWarns(FutureWarning):
            childRefs2 = set(
                registry.queryDatasets(
                    "bias.wcs",
                    collections=collection,
                )
            )
        self.assertEqual({ref.datasetType for ref in childRefs2}, {childType})
        self.assertEqual({ref.dataId for ref in childRefs2}, set(dataIds))

    def testCollections(self):
        """Tests for registry methods that manage collections."""
        registry = self.makeRegistry()
        other_registry = self.makeRegistry(share_repo_with=registry)
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        run1 = "imported_g"
        run2 = "imported_r"
        # Test setting a collection docstring after it has been created.
        registry.setCollectionDocumentation(run1, "doc for run1")
        self.assertEqual(registry.getCollectionDocumentation(run1), "doc for run1")
        registry.setCollectionDocumentation(run1, None)
        self.assertIsNone(registry.getCollectionDocumentation(run1))
        datasetType = "bias"
        # Find some datasets via their run's collection.
        dataId1 = {"instrument": "Cam1", "detector": 1}
        ref1 = registry.findDataset(datasetType, dataId1, collections=run1)
        self.assertIsNotNone(ref1)
        dataId2 = {"instrument": "Cam1", "detector": 2}
        ref2 = registry.findDataset(datasetType, dataId2, collections=run1)
        self.assertIsNotNone(ref2)
        # Associate those into a new collection, then look for them there.
        tag1 = "tag1"
        registry.registerCollection(tag1, type=CollectionType.TAGGED, doc="doc for tag1")
        # Check that we can query for old and new collections by type.
        self.assertEqual(set(registry.queryCollections(collectionTypes=CollectionType.RUN)), {run1, run2})
        self.assertEqual(
            set(registry.queryCollections(collectionTypes={CollectionType.TAGGED, CollectionType.RUN})),
            {tag1, run1, run2},
        )
        self.assertEqual(registry.getCollectionDocumentation(tag1), "doc for tag1")
        registry.associate(tag1, [ref1, ref2])
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        # Disassociate one and verify that we can't find it there anymore...
        registry.disassociate(tag1, [ref1])
        self.assertIsNone(registry.findDataset(datasetType, dataId1, collections=tag1))
        # ...but we can still find ref2 in tag1, and ref1 in the run.
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=run1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        collections = set(registry.queryCollections())
        self.assertEqual(collections, {run1, run2, tag1})
        # Associate both refs into tag1 again; ref2 is already there, but that
        # should be a harmless no-op.
        registry.associate(tag1, [ref1, ref2])
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        # Get a different dataset (from a different run) that has the same
        # dataset type and data ID as ref2.
        ref2b = registry.findDataset(datasetType, dataId2, collections=run2)
        self.assertNotEqual(ref2, ref2b)
        # Attempting to associate that into tag1 should be an error.
        with self.assertRaises(ConflictingDefinitionError):
            registry.associate(tag1, [ref2b])
        # That error shouldn't have messed up what we had before.
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        # Attempt to associate the conflicting dataset again, this time with
        # a dataset that isn't in the collection and won't cause a conflict.
        # This should also fail without modifying anything.
        dataId3 = {"instrument": "Cam1", "detector": 3}
        ref3 = registry.findDataset(datasetType, dataId3, collections=run1)
        with self.assertRaises(ConflictingDefinitionError):
            registry.associate(tag1, [ref3, ref2b])
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        self.assertIsNone(registry.findDataset(datasetType, dataId3, collections=tag1))
        # Register a chained collection that searches [tag1, run2]
        chain1 = "chain1"
        registry.registerCollection(chain1, type=CollectionType.CHAINED)
        self.assertIs(registry.getCollectionType(chain1), CollectionType.CHAINED)
        # Chained collection exists, but has no collections in it.
        self.assertFalse(registry.getCollectionChain(chain1))
        # If we query for all collections, we should get the chained collection
        # only if we don't ask to flatten it (i.e. yield only its children).
        self.assertEqual(set(registry.queryCollections(flattenChains=False)), {tag1, run1, run2, chain1})
        self.assertEqual(set(registry.queryCollections(flattenChains=True)), {tag1, run1, run2})
        # Attempt to set its child collections to something circular; that
        # should fail.
        with self.assertRaises(ValueError):
            registry.setCollectionChain(chain1, [tag1, chain1])
        # Add the child collections.
        registry.setCollectionChain(chain1, [tag1, run2])
        self.assertEqual(list(registry.getCollectionChain(chain1)), [tag1, run2])
        self.assertEqual(registry.getCollectionParentChains(tag1), {chain1})
        self.assertEqual(registry.getCollectionParentChains(run2), {chain1})
        # Refresh the other registry that points to the same repo, and make
        # sure it can see the things we've done (note that this does require
        # an explicit refresh(); that's the documented behavior, because
        # caching is ~impossible otherwise).
        if other_registry is not None:
            other_registry.refresh()
            self.assertEqual(list(other_registry.getCollectionChain(chain1)), [tag1, run2])
            self.assertEqual(other_registry.getCollectionParentChains(tag1), {chain1})
            self.assertEqual(other_registry.getCollectionParentChains(run2), {chain1})
        # Searching for dataId1 or dataId2 in the chain should return ref1 and
        # ref2, because both are in tag1.
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=chain1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=chain1), ref2)
        # Now disassociate ref2 from tag1. The search (for bias) with
        # dataId2 in chain1 should then:
        # 1. not find it in tag1
        # 2. find a different dataset in run2
        registry.disassociate(tag1, [ref2])
        ref2b = registry.findDataset(datasetType, dataId2, collections=chain1)
        self.assertNotEqual(ref2b, ref2)
        self.assertEqual(ref2b, registry.findDataset(datasetType, dataId2, collections=run2))
        # Define a new chain so we can test recursive chains.
        chain2 = "chain2"
        registry.registerCollection(chain2, type=CollectionType.CHAINED)
        registry.setCollectionChain(chain2, [run2, chain1])
        self.assertEqual(registry.getCollectionParentChains(chain1), {chain2})
        self.assertEqual(registry.getCollectionParentChains(run2), {chain1, chain2})
        # Query for collections matching a regex.
        self.assertCountEqual(
            list(registry.queryCollections(re.compile("imported_."), flattenChains=False)),
            ["imported_r", "imported_g"],
        )
        # Query for collections matching a regex or an explicit str.
        self.assertCountEqual(
            list(registry.queryCollections([re.compile("imported_."), "chain1"], flattenChains=False)),
            ["imported_r", "imported_g", "chain1"],
        )
        # A search for bias with dataId1 should find it via tag1 in chain2,
        # recursing, because it is not in run2.
        self.assertIsNone(registry.findDataset(datasetType, dataId1, collections=run2))
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=chain2), ref1)
        # A search for bias with dataId2 should find it in run2 (ref2b).
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=chain2), ref2b)
        # Search for a flat that is in run2. That should not be found
        # at the front of chain2, because of the restriction to bias
        # on run2 there, but it should be found at the end of chain1.
        dataId4 = {"instrument": "Cam1", "detector": 3, "physical_filter": "Cam1-R2"}
        ref4 = registry.findDataset("flat", dataId4, collections=run2)
        self.assertIsNotNone(ref4)
        self.assertEqual(ref4, registry.findDataset("flat", dataId4, collections=chain2))
        # Deleting a collection that's part of a CHAINED collection is not
        # allowed, and is exception-safe.
        with self.assertRaises(Exception):
            registry.removeCollection(run2)
        self.assertEqual(registry.getCollectionType(run2), CollectionType.RUN)
        with self.assertRaises(Exception):
            registry.removeCollection(chain1)
        self.assertEqual(registry.getCollectionType(chain1), CollectionType.CHAINED)
        # Actually remove chain2, and test that it's gone by asking for its
        # type.
        registry.removeCollection(chain2)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(chain2)
        # Actually remove run2 and chain1, which should work now.
        registry.removeCollection(chain1)
        registry.removeCollection(run2)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(run2)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(chain1)
        # Remove tag1 as well, just to test that we can remove TAGGED
        # collections.
        registry.removeCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(tag1)

    def testCollectionChainFlatten(self):
        """Test that Registry.setCollectionChain obeys its 'flatten' option."""
        registry = self.makeRegistry()
        registry.registerCollection("inner", CollectionType.CHAINED)
        registry.registerCollection("innermost", CollectionType.RUN)
        registry.setCollectionChain("inner", ["innermost"])
        registry.registerCollection("outer", CollectionType.CHAINED)
        registry.setCollectionChain("outer", ["inner"], flatten=False)
        self.assertEqual(list(registry.getCollectionChain("outer")), ["inner"])
        registry.setCollectionChain("outer", ["inner"], flatten=True)
        self.assertEqual(list(registry.getCollectionChain("outer")), ["innermost"])
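
    # The chain operations above, in brief (collection names are the ones
    # registered in this test): flatten=True replaces any CHAINED children
    # with their own children at definition time, so "outer" records
    # "innermost" directly rather than going through "inner":
    #
    #     registry.setCollectionChain("outer", ["inner"], flatten=True)
    #     assert list(registry.getCollectionChain("outer")) == ["innermost"]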

    def testBasicTransaction(self):
        """Test that all operations within a single transaction block are
        rolled back if an exception propagates out of the block.
        """
        registry = self.makeRegistry()
        storageClass = StorageClass("testDatasetType")
        registry.storageClasses.registerStorageClass(storageClass)
        with registry.transaction():
            registry.insertDimensionData("instrument", {"name": "Cam1", "class_name": "A"})
        with self.assertRaises(ValueError):
            with registry.transaction():
                registry.insertDimensionData("instrument", {"name": "Cam2"})
                raise ValueError("Oops, something went wrong")
        # Cam1 should exist
        self.assertEqual(registry.expandDataId(instrument="Cam1").records["instrument"].class_name, "A")
        # But Cam2 and Cam3 should both not exist
        with self.assertRaises(DataIdValueError):
            registry.expandDataId(instrument="Cam2")
        with self.assertRaises(DataIdValueError):
            registry.expandDataId(instrument="Cam3")

    def testNestedTransaction(self):
        """Test that operations within a transaction block are not rolled back
        if an exception propagates out of an inner transaction block and is
        then caught.
        """
        registry = self.makeRegistry()
        dimension = registry.dimensions["instrument"]
        dataId1 = {"instrument": "DummyCam"}
        dataId2 = {"instrument": "DummyCam2"}
        checkpointReached = False
        with registry.transaction():
            # This should be added and (ultimately) committed.
            registry.insertDimensionData(dimension, dataId1)
            with self.assertRaises(sqlalchemy.exc.IntegrityError):
                with registry.transaction(savepoint=True):
                    # This does not conflict, and should succeed (but not
                    # be committed).
                    registry.insertDimensionData(dimension, dataId2)
                    checkpointReached = True
                    # This should conflict and raise, triggering a rollback
                    # of the previous insertion within the same transaction
                    # context, but not the original insertion in the outer
                    # block.
                    registry.insertDimensionData(dimension, dataId1)
        self.assertTrue(checkpointReached)
        self.assertIsNotNone(registry.expandDataId(dataId1, graph=dimension.graph))
        with self.assertRaises(DataIdValueError):
            registry.expandDataId(dataId2, graph=dimension.graph)
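
    # The savepoint pattern exercised above, in brief (a sketch; a database
    # error inside the inner block rolls back only to the savepoint, leaving
    # the outer transaction intact):
    #
    #     with registry.transaction():          # outer work is committed
    #         do_something(registry)            # hypothetical helper
    #         try:
    #             with registry.transaction(savepoint=True):
    #                 do_something_risky(registry)  # rolled back on failure
    #         except Exception:
    #             pass  # outer transaction still commits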

    def testInstrumentDimensions(self):
        """Test queries involving only instrument dimensions, with no joins to
        skymap."""
        registry = self.makeRegistry()

        # need a bunch of dimensions and datasets for test
        registry.insertDimensionData(
            "instrument", dict(name="DummyCam", visit_max=25, exposure_max=300, detector_max=6)
        )
        registry.insertDimensionData(
            "physical_filter",
            dict(instrument="DummyCam", name="dummy_r", band="r"),
            dict(instrument="DummyCam", name="dummy_i", band="i"),
        )
        registry.insertDimensionData(
            "detector", *[dict(instrument="DummyCam", id=i, full_name=str(i)) for i in range(1, 6)]
        )
        registry.insertDimensionData(
            "visit_system",
            dict(instrument="DummyCam", id=1, name="default"),
        )
        registry.insertDimensionData(
            "visit",
            dict(instrument="DummyCam", id=10, name="ten", physical_filter="dummy_i", visit_system=1),
            dict(instrument="DummyCam", id=11, name="eleven", physical_filter="dummy_r", visit_system=1),
            dict(instrument="DummyCam", id=20, name="twelve", physical_filter="dummy_r", visit_system=1),
        )
        for i in range(1, 6):
            registry.insertDimensionData(
                "visit_detector_region",
                dict(instrument="DummyCam", visit=10, detector=i),
                dict(instrument="DummyCam", visit=11, detector=i),
                dict(instrument="DummyCam", visit=20, detector=i),
            )
        registry.insertDimensionData(
            "exposure",
            dict(instrument="DummyCam", id=100, obs_id="100", physical_filter="dummy_i"),
            dict(instrument="DummyCam", id=101, obs_id="101", physical_filter="dummy_i"),
            dict(instrument="DummyCam", id=110, obs_id="110", physical_filter="dummy_r"),
            dict(instrument="DummyCam", id=111, obs_id="111", physical_filter="dummy_r"),
            dict(instrument="DummyCam", id=200, obs_id="200", physical_filter="dummy_r"),
            dict(instrument="DummyCam", id=201, obs_id="201", physical_filter="dummy_r"),
        )
        registry.insertDimensionData(
            "visit_definition",
            dict(instrument="DummyCam", exposure=100, visit_system=1, visit=10),
            dict(instrument="DummyCam", exposure=101, visit_system=1, visit=10),
            dict(instrument="DummyCam", exposure=110, visit_system=1, visit=11),
            dict(instrument="DummyCam", exposure=111, visit_system=1, visit=11),
            dict(instrument="DummyCam", exposure=200, visit_system=1, visit=20),
            dict(instrument="DummyCam", exposure=201, visit_system=1, visit=20),
        )
        # dataset types
        run1 = "test1_r"
        run2 = "test2_r"
        tagged2 = "test2_t"
        registry.registerRun(run1)
        registry.registerRun(run2)
        registry.registerCollection(tagged2)
        storageClass = StorageClass("testDataset")
        registry.storageClasses.registerStorageClass(storageClass)
        rawType = DatasetType(
            name="RAW",
            dimensions=registry.dimensions.extract(("instrument", "exposure", "detector")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(rawType)
        calexpType = DatasetType(
            name="CALEXP",
            dimensions=registry.dimensions.extract(("instrument", "visit", "detector")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(calexpType)

        # add pre-existing datasets
        for exposure in (100, 101, 110, 111):
            for detector in (1, 2, 3):
                # note that only 3 of 5 detectors have datasets
                dataId = dict(instrument="DummyCam", exposure=exposure, detector=detector)
                (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run1)
                # Exposures 100 and 101 appear in both run1 and tagged2:
                # 100 has different datasets in the different collections;
                # 101 has the same dataset in both collections.
                if exposure == 100:
                    (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run2)
                if exposure in (100, 101):
                    registry.associate(tagged2, [ref])
        # Add pre-existing datasets to tagged2.
        for exposure in (200, 201):
            for detector in (3, 4, 5):
                # note that only 3 of 5 detectors have datasets
                dataId = dict(instrument="DummyCam", exposure=exposure, detector=detector)
                (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run2)
                registry.associate(tagged2, [ref])

        dimensions = DimensionGraph(
            registry.dimensions, dimensions=(rawType.dimensions.required | calexpType.dimensions.required)
        )
        # Test that a single dim string works as well as a list of str
        rows = registry.queryDataIds("visit", datasets=rawType, collections=run1).expanded().toSet()
        rowsI = registry.queryDataIds(["visit"], datasets=rawType, collections=run1).expanded().toSet()
        self.assertEqual(rows, rowsI)
        # with empty expression
        rows = registry.queryDataIds(dimensions, datasets=rawType, collections=run1).expanded().toSet()
        self.assertEqual(len(rows), 4 * 3)  # 4 exposures times 3 detectors
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101, 110, 111))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10, 11))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3))

        # second collection
        rows = registry.queryDataIds(dimensions, datasets=rawType, collections=tagged2).toSet()
        self.assertEqual(len(rows), 4 * 3)  # 4 exposures times 3 detectors
        for dataId in rows:
            self.assertCountEqual(dataId.keys(), ("instrument", "detector", "exposure", "visit"))
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101, 200, 201))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10, 20))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3, 4, 5))

        # with two input datasets
        rows = registry.queryDataIds(dimensions, datasets=rawType, collections=[run1, tagged2]).toSet()
        self.assertEqual(len(set(rows)), 6 * 3)  # 6 exposures times 3 detectors; set needed to de-dupe
        for dataId in rows:
            self.assertCountEqual(dataId.keys(), ("instrument", "detector", "exposure", "visit"))
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101, 110, 111, 200, 201))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10, 11, 20))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3, 4, 5))

        # limit to single visit
        rows = registry.queryDataIds(
            dimensions, datasets=rawType, collections=run1, where="visit = 10", instrument="DummyCam"
        ).toSet()
        self.assertEqual(len(rows), 2 * 3)  # 2 exposures times 3 detectors
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10,))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3))

        # more limiting expression, using link names instead of Table.column
        rows = registry.queryDataIds(
            dimensions,
            datasets=rawType,
            collections=run1,
            where="visit = 10 and detector > 1 and 'DummyCam'=instrument",
        ).toSet()
        self.assertEqual(len(rows), 2 * 2)  # 2 exposures times 2 detectors
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10,))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (2, 3))

        # queryDataIds with only one of `datasets` and `collections` is an
        # error.
        with self.assertRaises(CollectionError):
            registry.queryDataIds(dimensions, datasets=rawType)
        with self.assertRaises(ArgumentError):
            registry.queryDataIds(dimensions, collections=run1)

        # expression excludes everything
        rows = registry.queryDataIds(
            dimensions, datasets=rawType, collections=run1, where="visit > 1000", instrument="DummyCam"
        ).toSet()
        self.assertEqual(len(rows), 0)

        # Selecting by physical_filter: this is not in the dimensions, but it
        # is a part of the full expression, so it should work too.
        rows = registry.queryDataIds(
            dimensions,
            datasets=rawType,
            collections=run1,
            where="physical_filter = 'dummy_r'",
            instrument="DummyCam",
        ).toSet()
        self.assertEqual(len(rows), 2 * 3)  # 2 exposures times 3 detectors
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (110, 111))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (11,))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3))
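
    # queryDataIds in a sketch (names taken from the test above): dimensions
    # to return, an optional dataset/collection constraint, and a string
    # `where` expression over dimension columns:
    #
    #     rows = registry.queryDataIds(
    #         ["exposure", "detector"],
    #         datasets="RAW",
    #         collections="test1_r",
    #         where="visit = 10 AND detector > 1",
    #         instrument="DummyCam",  # binds the governor dimension
    #     ).toSet()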

    def testSkyMapDimensions(self):
        """Tests involving only skymap dimensions, no joins to instrument."""
        registry = self.makeRegistry()

        # need a bunch of dimensions and datasets for test, we want
        # "band" in the test so also have to add physical_filter
        # dimensions
        registry.insertDimensionData("instrument", dict(instrument="DummyCam"))
        registry.insertDimensionData(
            "physical_filter",
            dict(instrument="DummyCam", name="dummy_r", band="r"),
            dict(instrument="DummyCam", name="dummy_i", band="i"),
        )
        registry.insertDimensionData("skymap", dict(name="DummyMap", hash=b"sha!"))
        for tract in range(10):
            registry.insertDimensionData("tract", dict(skymap="DummyMap", id=tract))
            registry.insertDimensionData(
                "patch",
                *[dict(skymap="DummyMap", tract=tract, id=patch, cell_x=0, cell_y=0) for patch in range(10)],
            )

        # dataset types
        run = "tésτ"
        registry.registerRun(run)
        storageClass = StorageClass("testDataset")
        registry.storageClasses.registerStorageClass(storageClass)
        calexpType = DatasetType(
            name="deepCoadd_calexp",
            dimensions=registry.dimensions.extract(("skymap", "tract", "patch", "band")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(calexpType)
        mergeType = DatasetType(
            name="deepCoadd_mergeDet",
            dimensions=registry.dimensions.extract(("skymap", "tract", "patch")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(mergeType)
        measType = DatasetType(
            name="deepCoadd_meas",
            dimensions=registry.dimensions.extract(("skymap", "tract", "patch", "band")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(measType)

        dimensions = DimensionGraph(
            registry.dimensions,
            dimensions=(
                calexpType.dimensions.required | mergeType.dimensions.required | measType.dimensions.required
            ),
        )

        # add pre-existing datasets
        for tract in (1, 3, 5):
            for patch in (2, 4, 6, 7):
                dataId = dict(skymap="DummyMap", tract=tract, patch=patch)
                registry.insertDatasets(mergeType, dataIds=[dataId], run=run)
                for aFilter in ("i", "r"):
                    dataId = dict(skymap="DummyMap", tract=tract, patch=patch, band=aFilter)
                    registry.insertDatasets(calexpType, dataIds=[dataId], run=run)

        # with empty expression
        rows = registry.queryDataIds(dimensions, datasets=[calexpType, mergeType], collections=run).toSet()
        self.assertEqual(len(rows), 3 * 4 * 2)  # 3 tracts x 4 patches x 2 filters
        for dataId in rows:
            self.assertCountEqual(dataId.keys(), ("skymap", "tract", "patch", "band"))
        self.assertCountEqual({dataId["tract"] for dataId in rows}, (1, 3, 5))
        self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 4, 6, 7))
        self.assertCountEqual({dataId["band"] for dataId in rows}, ("i", "r"))

        # limit to 2 tracts and 2 patches
        rows = registry.queryDataIds(
            dimensions,
            datasets=[calexpType, mergeType],
            collections=run,
            where="tract IN (1, 5) AND patch IN (2, 7)",
            skymap="DummyMap",
        ).toSet()
        self.assertEqual(len(rows), 2 * 2 * 2)  # 2 tracts x 2 patches x 2 filters
        self.assertCountEqual({dataId["tract"] for dataId in rows}, (1, 5))
        self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 7))
        self.assertCountEqual({dataId["band"] for dataId in rows}, ("i", "r"))

        # limit to single filter
        rows = registry.queryDataIds(
            dimensions, datasets=[calexpType, mergeType], collections=run, where="band = 'i'"
        ).toSet()
        self.assertEqual(len(rows), 3 * 4 * 1)  # 3 tracts x 4 patches x 1 filter
        self.assertCountEqual({dataId["tract"] for dataId in rows}, (1, 3, 5))
        self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 4, 6, 7))
        self.assertCountEqual({dataId["band"] for dataId in rows}, ("i",))

        # Specifying a non-existing skymap is an exception
        with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
            rows = registry.queryDataIds(
                dimensions, datasets=[calexpType, mergeType], collections=run, where="skymap = 'Mars'"
            ).toSet()
1233 def testSpatialJoin(self):
1234 """Test queries that involve spatial overlap joins."""
1235 registry = self.makeRegistry()
1236 self.loadData(registry, "hsc-rc2-subset.yaml")
1238 # Dictionary of spatial DatabaseDimensionElements, keyed by the name of
1239 # the TopologicalFamily they belong to. We'll relate all elements in
1240 # each family to all of the elements in each other family.
1241 families = defaultdict(set)
1242 # Dictionary of {element.name: {dataId: region}}.
1243 regions = {}
1244 for element in registry.dimensions.getDatabaseElements():
1245 if element.spatial is not None:
1246 families[element.spatial.name].add(element)
1247 regions[element.name] = {
1248 record.dataId: record.region for record in registry.queryDimensionRecords(element)
1249 }
1251 # If this check fails, it's not necessarily a problem - it may just be
1252 # a reasonable change to the default dimension definitions - but the
1253 # test below depends on there being more than one family to do anything
1254 # useful.
1255 self.assertEqual(len(families), 2)
1257 # Overlap DatabaseDimensionElements with each other.
1258 for family1, family2 in itertools.combinations(families, 2):
1259 for element1, element2 in itertools.product(families[family1], families[family2]):
1260 graph = DimensionGraph.union(element1.graph, element2.graph)
1261 # Construct expected set of overlapping data IDs via a
1262 # brute-force comparison of the regions we've already fetched.
1263 expected = {
1264 DataCoordinate.standardize({**dataId1.byName(), **dataId2.byName()}, graph=graph)
1265 for (dataId1, region1), (dataId2, region2) in itertools.product(
1266 regions[element1.name].items(), regions[element2.name].items()
1267 )
1268 if not region1.isDisjointFrom(region2)
1269 }
1270 self.assertGreater(len(expected), 2, msg="Test that we aren't just comparing empty sets.")
1271 queried = set(registry.queryDataIds(graph))
1272 self.assertEqual(expected, queried)
1274 # Overlap each DatabaseDimensionElement with the commonSkyPix system.
1275 commonSkyPix = registry.dimensions.commonSkyPix
1276 for elementName, element_regions in regions.items():
1277 graph = DimensionGraph.union(registry.dimensions[elementName].graph, commonSkyPix.graph)
1278 expected = set()
1279 for dataId, region in element_regions.items():
1280 for begin, end in commonSkyPix.pixelization.envelope(region):
1281 expected.update(
1282 DataCoordinate.standardize({commonSkyPix.name: index, **dataId.byName()}, graph=graph)
1283 for index in range(begin, end)
1284 )
1285 self.assertGreater(len(expected), 2, msg="Test that we aren't just comparing empty sets.")
1286 queried = set(registry.queryDataIds(graph))
1287 self.assertEqual(expected, queried)
1289 def testAbstractQuery(self):
1290 """Test that we can run a query that just lists the known
1291 bands. This is tricky because band is
1292 backed by a query against physical_filter.
1293 """
1294 registry = self.makeRegistry()
1295 registry.insertDimensionData("instrument", dict(name="DummyCam"))
1296 registry.insertDimensionData(
1297 "physical_filter",
1298 dict(instrument="DummyCam", name="dummy_i", band="i"),
1299 dict(instrument="DummyCam", name="dummy_i2", band="i"),
1300 dict(instrument="DummyCam", name="dummy_r", band="r"),
1301 )
1302 rows = registry.queryDataIds(["band"]).toSet()
1303 self.assertCountEqual(
1304 rows,
1305 [
1306 DataCoordinate.standardize(band="i", universe=registry.dimensions),
1307 DataCoordinate.standardize(band="r", universe=registry.dimensions),
1308 ],
1309 )
1311 def testAttributeManager(self):
1312 """Test basic functionality of attribute manager."""
1313 # Number of attribute records (schema versions etc.) in a fresh
1314 # database: 6 managers with 2 records per manager, plus the dimensions config.
1315 VERSION_COUNT = 6 * 2 + 1
1317 registry = self.makeRegistry()
1318 attributes = registry._managers.attributes
1320 # check what get() returns for non-existing key
1321 self.assertIsNone(attributes.get("attr"))
1322 self.assertEqual(attributes.get("attr", ""), "")
1323 self.assertEqual(attributes.get("attr", "Value"), "Value")
1324 self.assertEqual(len(list(attributes.items())), VERSION_COUNT)
1326 # cannot store empty key or value
1327 with self.assertRaises(ValueError):
1328 attributes.set("", "value")
1329 with self.assertRaises(ValueError):
1330 attributes.set("attr", "")
1332 # set value of non-existing key
1333 attributes.set("attr", "value")
1334 self.assertEqual(len(list(attributes.items())), VERSION_COUNT + 1)
1335 self.assertEqual(attributes.get("attr"), "value")
1337 # update value of existing key
1338 with self.assertRaises(ButlerAttributeExistsError):
1339 attributes.set("attr", "value2")
1341 attributes.set("attr", "value2", force=True)
1342 self.assertEqual(len(list(attributes.items())), VERSION_COUNT + 1)
1343 self.assertEqual(attributes.get("attr"), "value2")
1345 # delete existing key
1346 self.assertTrue(attributes.delete("attr"))
1347 self.assertEqual(len(list(attributes.items())), VERSION_COUNT)
1349 # delete non-existing key
1350 self.assertFalse(attributes.delete("non-attr"))
1352 # store bunch of keys and get the list back
1353 data = [
1354 ("version.core", "1.2.3"),
1355 ("version.dimensions", "3.2.1"),
1356 ("config.managers.opaque", "ByNameOpaqueTableStorageManager"),
1357 ]
1358 for key, value in data:
1359 attributes.set(key, value)
1360 items = dict(attributes.items())
1361 for key, value in data:
1362 self.assertEqual(items[key], value)
1364 def testQueryDatasetsDeduplication(self):
1365 """Test that the findFirst option to queryDatasets selects datasets
1366 from collections in the order given.
1367 """
1368 registry = self.makeRegistry()
1369 self.loadData(registry, "base.yaml")
1370 self.loadData(registry, "datasets.yaml")
1371 self.assertCountEqual(
1372 list(registry.queryDatasets("bias", collections=["imported_g", "imported_r"])),
1373 [
1374 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1375 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g"),
1376 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g"),
1377 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r"),
1378 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r"),
1379 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1380 ],
1381 )
1382 self.assertCountEqual(
1383 list(registry.queryDatasets("bias", collections=["imported_g", "imported_r"], findFirst=True)),
1384 [
1385 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1386 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g"),
1387 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g"),
1388 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1389 ],
1390 )
1391 self.assertCountEqual(
1392 list(registry.queryDatasets("bias", collections=["imported_r", "imported_g"], findFirst=True)),
1393 [
1394 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1395 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r"),
1396 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r"),
1397 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1398 ],
1399 )
1401 def testQueryResults(self):
1402 """Test querying for data IDs and then manipulating the QueryResults
1403 object returned to perform other queries.
1404 """
1405 registry = self.makeRegistry()
1406 self.loadData(registry, "base.yaml")
1407 self.loadData(registry, "datasets.yaml")
1408 bias = registry.getDatasetType("bias")
1409 flat = registry.getDatasetType("flat")
1410 # Obtain expected results from methods other than those we're testing
1411 # here. That includes:
1412 # - the dimensions of the data IDs we want to query:
1413 expectedGraph = DimensionGraph(registry.dimensions, names=["detector", "physical_filter"])
1414 # - the dimensions of some other data IDs we'll extract from that:
1415 expectedSubsetGraph = DimensionGraph(registry.dimensions, names=["detector"])
1416 # - the data IDs we expect to obtain from the first queries:
1417 expectedDataIds = DataCoordinateSet(
1418 {
1419 DataCoordinate.standardize(
1420 instrument="Cam1", detector=d, physical_filter=p, universe=registry.dimensions
1421 )
1422 for d, p in itertools.product({1, 2, 3}, {"Cam1-G", "Cam1-R1", "Cam1-R2"})
1423 },
1424 graph=expectedGraph,
1425 hasFull=False,
1426 hasRecords=False,
1427 )
1428 # - the flat datasets we expect to find from those data IDs, in just
1429 # one collection (so deduplication is irrelevant):
1430 expectedFlats = [
1431 registry.findDataset(
1432 flat, instrument="Cam1", detector=1, physical_filter="Cam1-R1", collections="imported_r"
1433 ),
1434 registry.findDataset(
1435 flat, instrument="Cam1", detector=2, physical_filter="Cam1-R1", collections="imported_r"
1436 ),
1437 registry.findDataset(
1438 flat, instrument="Cam1", detector=3, physical_filter="Cam1-R2", collections="imported_r"
1439 ),
1440 ]
1441 # - the data IDs we expect to extract from that:
1442 expectedSubsetDataIds = expectedDataIds.subset(expectedSubsetGraph)
1443 # - the bias datasets we expect to find from those data IDs, after we
1444 # subset out the physical_filter dimension, first with duplicates:
1445 expectedAllBiases = [
1446 registry.findDataset(bias, instrument="Cam1", detector=1, collections="imported_g"),
1447 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_g"),
1448 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_g"),
1449 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_r"),
1450 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_r"),
1451 ]
1452 # - ...and without duplicates:
1453 expectedDeduplicatedBiases = [
1454 registry.findDataset(bias, instrument="Cam1", detector=1, collections="imported_g"),
1455 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_r"),
1456 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_r"),
1457 ]
1458 # Test against those expected results, using a "lazy" query for the
1459 # data IDs (which re-executes that query each time we use it to do
1460 # something new).
1461 dataIds = registry.queryDataIds(
1462 ["detector", "physical_filter"],
1463 where="detector.purpose = 'SCIENCE'", # this rejects detector=4
1464 instrument="Cam1",
1465 )
1466 self.assertEqual(dataIds.graph, expectedGraph)
1467 self.assertEqual(dataIds.toSet(), expectedDataIds)
1468 self.assertCountEqual(
1469 list(
1470 dataIds.findDatasets(
1471 flat,
1472 collections=["imported_r"],
1473 )
1474 ),
1475 expectedFlats,
1476 )
1477 subsetDataIds = dataIds.subset(expectedSubsetGraph, unique=True)
1478 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1479 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1480 self.assertCountEqual(
1481 list(subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=False)),
1482 expectedAllBiases,
1483 )
1484 self.assertCountEqual(
1485 list(subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)),
1486 expectedDeduplicatedBiases,
1487 )
1489 # Check that a dataset type whose dimensions are not a subset of the query's is rejected.
1490 with self.assertRaises(ValueError):
1491 subsetDataIds.findDatasets("flat", collections=["imported_r", "imported_g"], findFirst=True)
1493 # Use a component dataset type.
1494 self.assertCountEqual(
1495 [
1496 ref.makeComponentRef("image")
1497 for ref in subsetDataIds.findDatasets(
1498 bias,
1499 collections=["imported_r", "imported_g"],
1500 findFirst=False,
1501 )
1502 ],
1503 [ref.makeComponentRef("image") for ref in expectedAllBiases],
1504 )
1506 # Use a named dataset type that does not exist and a dataset type
1507 # object that does not exist.
1508 unknown_type = DatasetType("not_known", dimensions=bias.dimensions, storageClass="Exposure")
1510 # Test both string name and dataset type object.
1511 test_type: str | DatasetType
1512 for test_type, test_type_name in (
1513 (unknown_type, unknown_type.name),
1514 (unknown_type.name, unknown_type.name),
1515 ):
1516 with self.assertRaisesRegex(DatasetTypeError, expected_regex=test_type_name):
1517 list(
1518 subsetDataIds.findDatasets(
1519 test_type, collections=["imported_r", "imported_g"], findFirst=True
1520 )
1521 )
1523 # Materialize the bias dataset queries (only) by putting the results
1524 # into temporary tables, then repeat those tests.
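# (materialize() returns a context manager; the temporary table only
# lives for the duration of the ``with`` block.)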
1525 with subsetDataIds.findDatasets(
1526 bias, collections=["imported_r", "imported_g"], findFirst=False
1527 ).materialize() as biases:
1528 self.assertCountEqual(list(biases), expectedAllBiases)
1529 with subsetDataIds.findDatasets(
1530 bias, collections=["imported_r", "imported_g"], findFirst=True
1531 ).materialize() as biases:
1532 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1533 # Materialize the data ID subset query, but not the dataset queries.
1534 with subsetDataIds.materialize() as subsetDataIds:
1535 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1536 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1537 self.assertCountEqual(
1538 list(
1539 subsetDataIds.findDatasets(
1540 bias, collections=["imported_r", "imported_g"], findFirst=False
1541 )
1542 ),
1543 expectedAllBiases,
1544 )
1545 self.assertCountEqual(
1546 list(
1547 subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)
1548 ),
1549 expectedDeduplicatedBiases,
1550 )
1551 # Materialize the dataset queries, too.
1552 with subsetDataIds.findDatasets(
1553 bias, collections=["imported_r", "imported_g"], findFirst=False
1554 ).materialize() as biases:
1555 self.assertCountEqual(list(biases), expectedAllBiases)
1556 with subsetDataIds.findDatasets(
1557 bias, collections=["imported_r", "imported_g"], findFirst=True
1558 ).materialize() as biases:
1559 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1560 # Materialize the original query, but none of the follow-up queries.
1561 with dataIds.materialize() as dataIds:
1562 self.assertEqual(dataIds.graph, expectedGraph)
1563 self.assertEqual(dataIds.toSet(), expectedDataIds)
1564 self.assertCountEqual(
1565 list(
1566 dataIds.findDatasets(
1567 flat,
1568 collections=["imported_r"],
1569 )
1570 ),
1571 expectedFlats,
1572 )
1573 subsetDataIds = dataIds.subset(expectedSubsetGraph, unique=True)
1574 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1575 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1576 self.assertCountEqual(
1577 list(
1578 subsetDataIds.findDatasets(
1579 bias, collections=["imported_r", "imported_g"], findFirst=False
1580 )
1581 ),
1582 expectedAllBiases,
1583 )
1584 self.assertCountEqual(
1585 list(
1586 subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)
1587 ),
1588 expectedDeduplicatedBiases,
1589 )
1590 # Materialize just the bias dataset queries.
1591 with subsetDataIds.findDatasets(
1592 bias, collections=["imported_r", "imported_g"], findFirst=False
1593 ).materialize() as biases:
1594 self.assertCountEqual(list(biases), expectedAllBiases)
1595 with subsetDataIds.findDatasets(
1596 bias, collections=["imported_r", "imported_g"], findFirst=True
1597 ).materialize() as biases:
1598 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1599 # Materialize the subset data ID query, but not the dataset
1600 # queries.
1601 with subsetDataIds.materialize() as subsetDataIds:
1602 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1603 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1604 self.assertCountEqual(
1605 list(
1606 subsetDataIds.findDatasets(
1607 bias, collections=["imported_r", "imported_g"], findFirst=False
1608 )
1609 ),
1610 expectedAllBiases,
1611 )
1612 self.assertCountEqual(
1613 list(
1614 subsetDataIds.findDatasets(
1615 bias, collections=["imported_r", "imported_g"], findFirst=True
1616 )
1617 ),
1618 expectedDeduplicatedBiases,
1619 )
1620 # Materialize the bias dataset queries, too, so now we're
1621 # materializing every single step.
1622 with subsetDataIds.findDatasets(
1623 bias, collections=["imported_r", "imported_g"], findFirst=False
1624 ).materialize() as biases:
1625 self.assertCountEqual(list(biases), expectedAllBiases)
1626 with subsetDataIds.findDatasets(
1627 bias, collections=["imported_r", "imported_g"], findFirst=True
1628 ).materialize() as biases:
1629 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1631 def testStorageClassPropagation(self):
1632 """Test that queries for datasets respect the storage class passed in
1633 as part of a full dataset type.
1634 """
1635 registry = self.makeRegistry()
1636 self.loadData(registry, "base.yaml")
1637 dataset_type_in_registry = DatasetType(
1638 "tbl", dimensions=["instrument"], storageClass="DataFrame", universe=registry.dimensions
1639 )
1640 registry.registerDatasetType(dataset_type_in_registry)
1641 run = "run1"
1642 registry.registerRun(run)
1643 (inserted_ref,) = registry.insertDatasets(
1644 dataset_type_in_registry, [registry.expandDataId(instrument="Cam1")], run=run
1645 )
1646 self.assertEqual(inserted_ref.datasetType, dataset_type_in_registry)
1647 query_dataset_type = DatasetType(
1648 "tbl", dimensions=["instrument"], storageClass="ArrowAstropy", universe=registry.dimensions
1649 )
1650 self.assertNotEqual(dataset_type_in_registry, query_dataset_type)
1651 query_datasets_result = registry.queryDatasets(query_dataset_type, collections=[run])
1652 self.assertEqual(query_datasets_result.parentDatasetType, query_dataset_type) # type: ignore
1653 (query_datasets_ref,) = query_datasets_result
1654 self.assertEqual(query_datasets_ref.datasetType, query_dataset_type)
1655 query_data_ids_find_datasets_result = registry.queryDataIds(["instrument"]).findDatasets(
1656 query_dataset_type, collections=[run]
1657 )
1658 self.assertEqual(query_data_ids_find_datasets_result.parentDatasetType, query_dataset_type)
1659 (query_data_ids_find_datasets_ref,) = query_data_ids_find_datasets_result
1660 self.assertEqual(query_data_ids_find_datasets_ref.datasetType, query_dataset_type)
1661 query_dataset_types_result = registry.queryDatasetTypes(query_dataset_type)
1662 self.assertEqual(list(query_dataset_types_result), [query_dataset_type])
1663 find_dataset_ref = registry.findDataset(query_dataset_type, instrument="Cam1", collections=[run])
1664 self.assertEqual(find_dataset_ref.datasetType, query_dataset_type)
1666 def testEmptyDimensionsQueries(self):
1667 """Test Query and QueryResults objects in the case where there are no
1668 dimensions.
1669 """
1670 # Set up test data: one dataset type, two runs, one dataset in each.
1671 registry = self.makeRegistry()
1672 self.loadData(registry, "base.yaml")
1673 schema = DatasetType("schema", dimensions=registry.dimensions.empty, storageClass="Catalog")
1674 registry.registerDatasetType(schema)
1675 dataId = DataCoordinate.makeEmpty(registry.dimensions)
1676 run1 = "run1"
1677 run2 = "run2"
1678 registry.registerRun(run1)
1679 registry.registerRun(run2)
1680 (dataset1,) = registry.insertDatasets(schema, dataIds=[dataId], run=run1)
1681 (dataset2,) = registry.insertDatasets(schema, dataIds=[dataId], run=run2)
1682 # Query directly for both of the datasets at once, and then for each one at a time.
1683 self.checkQueryResults(
1684 registry.queryDatasets(schema, collections=[run1, run2], findFirst=False), [dataset1, dataset2]
1685 )
1686 self.checkQueryResults(
1687 registry.queryDatasets(schema, collections=[run1, run2], findFirst=True),
1688 [dataset1],
1689 )
1690 self.checkQueryResults(
1691 registry.queryDatasets(schema, collections=[run2, run1], findFirst=True),
1692 [dataset2],
1693 )
1694 # Query for data IDs with no dimensions.
1695 dataIds = registry.queryDataIds([])
1696 self.checkQueryResults(dataIds, [dataId])
1697 # Use queried data IDs to find the datasets.
1698 self.checkQueryResults(
1699 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1700 [dataset1, dataset2],
1701 )
1702 self.checkQueryResults(
1703 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1704 [dataset1],
1705 )
1706 self.checkQueryResults(
1707 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1708 [dataset2],
1709 )
1710 # Now materialize the data ID query results and repeat those tests.
1711 with dataIds.materialize() as dataIds:
1712 self.checkQueryResults(dataIds, [dataId])
1713 self.checkQueryResults(
1714 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1715 [dataset1],
1716 )
1717 self.checkQueryResults(
1718 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1719 [dataset2],
1720 )
1721 # Query for non-empty data IDs, then subset that to get the empty one.
1722 # Repeat the above tests starting from that.
1723 dataIds = registry.queryDataIds(["instrument"]).subset(registry.dimensions.empty, unique=True)
1724 self.checkQueryResults(dataIds, [dataId])
1725 self.checkQueryResults(
1726 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1727 [dataset1, dataset2],
1728 )
1729 self.checkQueryResults(
1730 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1731 [dataset1],
1732 )
1733 self.checkQueryResults(
1734 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1735 [dataset2],
1736 )
1737 with dataIds.materialize() as dataIds:
1738 self.checkQueryResults(dataIds, [dataId])
1739 self.checkQueryResults(
1740 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1741 [dataset1, dataset2],
1742 )
1743 self.checkQueryResults(
1744 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1745 [dataset1],
1746 )
1747 self.checkQueryResults(
1748 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1749 [dataset2],
1750 )
1751 # Query for non-empty data IDs, then materialize, then subset to get
1752 # the empty one. Repeat again.
1753 with registry.queryDataIds(["instrument"]).materialize() as nonEmptyDataIds:
1754 dataIds = nonEmptyDataIds.subset(registry.dimensions.empty, unique=True)
1755 self.checkQueryResults(dataIds, [dataId])
1756 self.checkQueryResults(
1757 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1758 [dataset1, dataset2],
1759 )
1760 self.checkQueryResults(
1761 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1762 [dataset1],
1763 )
1764 self.checkQueryResults(
1765 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1766 [dataset2],
1767 )
1768 with dataIds.materialize() as dataIds:
1769 self.checkQueryResults(dataIds, [dataId])
1770 self.checkQueryResults(
1771 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1772 [dataset1, dataset2],
1773 )
1774 self.checkQueryResults(
1775 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1776 [dataset1],
1777 )
1778 self.checkQueryResults(
1779 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1780 [dataset2],
1781 )
1782 # Query for non-empty data IDs with a constraint on an empty-data-ID
1783 # dataset that exists.
1784 dataIds = registry.queryDataIds(["instrument"], datasets="schema", collections=...)
1785 self.checkQueryResults(
1786 dataIds.subset(unique=True),
1787 [DataCoordinate.standardize(instrument="Cam1", universe=registry.dimensions)],
1788 )
1789 # Again query for non-empty data IDs with a constraint on empty-data-ID
1790 # datasets, but when the datasets don't exist. We delete the existing
1791 # dataset and query just that collection rather than creating a new
1792 # empty collection because this is a bit less likely for our build-time
1793 # logic to shortcut-out (via the collection summaries), and such a
1794 # shortcut would make this test a bit more trivial than we'd like.
1795 registry.removeDatasets([dataset2])
1796 dataIds = registry.queryDataIds(["instrument"], datasets="schema", collections=run2)
1797 self.checkQueryResults(dataIds, [])
1799 def testDimensionDataModifications(self):
1800 """Test that modifying dimension records via:
1801 syncDimensionData(..., update=True) and
1802 insertDimensionData(..., replace=True) works as expected, even in the
1803 presence of datasets using those dimensions and spatial overlap
1804 relationships.
1805 """
1807 def unpack_range_set(ranges: lsst.sphgeom.RangeSet) -> Iterator[int]:
1808 """Unpack a sphgeom.RangeSet into the integers it contains."""
1809 for begin, end in ranges:
1810 yield from range(begin, end)
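# Illustrative sketch, assuming sphgeom's half-open range iteration as
# relied on above: list(unpack_range_set(lsst.sphgeom.RangeSet(4, 8)))
# would yield [4, 5, 6, 7].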
1812 def range_set_hull(
1813 ranges: lsst.sphgeom.RangeSet,
1814 pixelization: lsst.sphgeom.HtmPixelization,
1815 ) -> lsst.sphgeom.ConvexPolygon:
1816 """Create a ConvexPolygon hull of the region defined by a set of
1817 HTM pixelization index ranges.
1818 """
1819 points = []
1820 for index in unpack_range_set(ranges):
1821 points.extend(pixelization.triangle(index).getVertices())
1822 return lsst.sphgeom.ConvexPolygon(points)
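# (Used below to grow each child trixel into the convex hull of all the
# htm6 trixels touched by its bounding circle.)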
1824 # Use HTM to set up an initial parent region (one arbitrary trixel)
1825 # and four child regions (the trixels within the parent at the next
1826 # level). We'll use the parent as a tract/visit region and the children
1827 # as its patch/visit_detector regions.
1828 registry = self.makeRegistry()
1829 htm6 = registry.dimensions.skypix["htm"][6].pixelization
1830 commonSkyPix = registry.dimensions.commonSkyPix.pixelization
1831 index = 12288
1832 child_ranges_small = lsst.sphgeom.RangeSet(index).scaled(4)
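# Illustrative arithmetic, not asserted by the test: the HTM children of
# trixel i are indices 4*i .. 4*i + 3, so scaling [12288, 12289) by 4
# gives [49152, 49156) at the next level.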
1833 assert htm6.universe().contains(child_ranges_small)
1834 child_regions_small = [htm6.triangle(i) for i in unpack_range_set(child_ranges_small)]
1835 parent_region_small = lsst.sphgeom.ConvexPolygon(
1836 list(itertools.chain.from_iterable(c.getVertices() for c in child_regions_small))
1837 )
1838 assert all(parent_region_small.contains(c) for c in child_regions_small)
1839 # Make a larger version of each child region, defined to be the set of
1840 # htm6 trixels that overlap the original's bounding circle. Make a new
1841 # parent that's the convex hull of the new children.
1842 child_regions_large = [
1843 range_set_hull(htm6.envelope(c.getBoundingCircle()), htm6) for c in child_regions_small
1844 ]
1845 assert all(large.contains(small) for large, small in zip(child_regions_large, child_regions_small))
1846 parent_region_large = lsst.sphgeom.ConvexPolygon(
1847 list(itertools.chain.from_iterable(c.getVertices() for c in child_regions_large))
1848 )
1849 assert all(parent_region_large.contains(c) for c in child_regions_large)
1850 assert parent_region_large.contains(parent_region_small)
1851 assert not parent_region_small.contains(parent_region_large)
1852 assert not all(parent_region_small.contains(c) for c in child_regions_large)
1853 # Find some commonSkyPix indices that overlap the large regions but do not
1854 # overlap the small regions. We use commonSkyPix here to make sure the
1855 # real tests later involve what's in the database, not just post-query
1856 # filtering of regions.
1857 child_difference_indices = []
1858 for large, small in zip(child_regions_large, child_regions_small):
1859 difference = list(unpack_range_set(commonSkyPix.envelope(large) - commonSkyPix.envelope(small)))
1860 assert difference, "if this is empty, we can't test anything useful with these regions"
1861 assert all(
1862 not commonSkyPix.triangle(d).isDisjointFrom(large)
1863 and commonSkyPix.triangle(d).isDisjointFrom(small)
1864 for d in difference
1865 )
1866 child_difference_indices.append(difference)
1867 parent_difference_indices = list(
1868 unpack_range_set(
1869 commonSkyPix.envelope(parent_region_large) - commonSkyPix.envelope(parent_region_small)
1870 )
1871 )
1872 assert parent_difference_indices, "if this is empty, we can't test anything useful with these regions"
1873 assert all(
1874 (
1875 not commonSkyPix.triangle(d).isDisjointFrom(parent_region_large)
1876 and commonSkyPix.triangle(d).isDisjointFrom(parent_region_small)
1877 )
1878 for d in parent_difference_indices
1879 )
1880 # Now that we've finally got those regions, we'll insert the large ones
1881 # as tract/patch dimension records.
1882 skymap_name = "testing_v1"
1883 registry.insertDimensionData(
1884 "skymap",
1885 {
1886 "name": skymap_name,
1887 "hash": bytes([42]),
1888 "tract_max": 1,
1889 "patch_nx_max": 2,
1890 "patch_ny_max": 2,
1891 },
1892 )
1893 registry.insertDimensionData("tract", {"skymap": skymap_name, "id": 0, "region": parent_region_large})
1894 registry.insertDimensionData(
1895 "patch",
1896 *[
1897 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1898 for n, c in enumerate(child_regions_large)
1899 ],
1900 )
1901 # Add a dataset that uses these dimensions to make sure that modifying
1902 # them doesn't disrupt foreign keys (need to make sure DB doesn't
1903 # implement insert with replace=True as delete-then-insert).
1904 dataset_type = DatasetType(
1905 "coadd",
1906 dimensions=["tract", "patch"],
1907 universe=registry.dimensions,
1908 storageClass="Exposure",
1909 )
1910 registry.registerDatasetType(dataset_type)
1911 registry.registerCollection("the_run", CollectionType.RUN)
1912 registry.insertDatasets(
1913 dataset_type,
1914 [{"skymap": skymap_name, "tract": 0, "patch": 2}],
1915 run="the_run",
1916 )
1917 # Query for tracts and patches that overlap some "difference"
1918 # commonSkyPix pixels; there should be overlaps, because the database has
1919 # the "large" suite of regions.
1920 self.assertEqual(
1921 {0},
1922 {
1923 data_id["tract"]
1924 for data_id in registry.queryDataIds(
1925 ["tract"],
1926 skymap=skymap_name,
1927 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
1928 )
1929 },
1930 )
1931 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
1932 self.assertIn(
1933 patch_id,
1934 {
1935 data_id["patch"]
1936 for data_id in registry.queryDataIds(
1937 ["patch"],
1938 skymap=skymap_name,
1939 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
1940 )
1941 },
1942 )
1943 # Use sync to update the tract region and insert (with replace=True) to
1944 # update the regions of the patches, switching to the "small" suite.
1945 updated = registry.syncDimensionData(
1946 "tract",
1947 {"skymap": skymap_name, "id": 0, "region": parent_region_small},
1948 update=True,
1949 )
1950 self.assertEqual(updated, {"region": parent_region_large})
1951 registry.insertDimensionData(
1952 "patch",
1953 *[
1954 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1955 for n, c in enumerate(child_regions_small)
1956 ],
1957 replace=True,
1958 )
1959 # Query again; there now should be no such overlaps, because the
1960 # database has the "small" suite of regions.
1961 self.assertFalse(
1962 set(
1963 registry.queryDataIds(
1964 ["tract"],
1965 skymap=skymap_name,
1966 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
1967 )
1968 )
1969 )
1970 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
1971 self.assertNotIn(
1972 patch_id,
1973 {
1974 data_id["patch"]
1975 for data_id in registry.queryDataIds(
1976 ["patch"],
1977 skymap=skymap_name,
1978 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
1979 )
1980 },
1981 )
1982 # Update back to the large regions and query one more time.
1983 updated = registry.syncDimensionData(
1984 "tract",
1985 {"skymap": skymap_name, "id": 0, "region": parent_region_large},
1986 update=True,
1987 )
1988 self.assertEqual(updated, {"region": parent_region_small})
1989 registry.insertDimensionData(
1990 "patch",
1991 *[
1992 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1993 for n, c in enumerate(child_regions_large)
1994 ],
1995 replace=True,
1996 )
1997 self.assertEqual(
1998 {0},
1999 {
2000 data_id["tract"]
2001 for data_id in registry.queryDataIds(
2002 ["tract"],
2003 skymap=skymap_name,
2004 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
2005 )
2006 },
2007 )
2008 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
2009 self.assertIn(
2010 patch_id,
2011 {
2012 data_id["patch"]
2013 for data_id in registry.queryDataIds(
2014 ["patch"],
2015 skymap=skymap_name,
2016 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
2017 )
2018 },
2019 )
2021 def testCalibrationCollections(self):
2022 """Test operations on `~CollectionType.CALIBRATION` collections,
2023 including `Registry.certify`, `Registry.decertify`, and
2024 `Registry.findDataset`.
2025 """
2026 # Setup - make a Registry, fill it with some datasets in
2027 # non-calibration collections.
2028 registry = self.makeRegistry()
2029 self.loadData(registry, "base.yaml")
2030 self.loadData(registry, "datasets.yaml")
2031 # Set up some timestamps.
2032 t1 = astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai")
2033 t2 = astropy.time.Time("2020-01-01T02:00:00", format="isot", scale="tai")
2034 t3 = astropy.time.Time("2020-01-01T03:00:00", format="isot", scale="tai")
2035 t4 = astropy.time.Time("2020-01-01T04:00:00", format="isot", scale="tai")
2036 t5 = astropy.time.Time("2020-01-01T05:00:00", format="isot", scale="tai")
2037 allTimespans = [
2038 Timespan(a, b) for a, b in itertools.combinations([None, t1, t2, t3, t4, t5, None], r=2)
2039 ]
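# Because None appears at both ends of the input list, this includes the
# fully unbounded Timespan(None, None) and all half-bounded spans as well
# as the fully bounded ones.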
2040 # Get references to some datasets.
2041 bias2a = registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g")
2042 bias3a = registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g")
2043 bias2b = registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r")
2044 bias3b = registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r")
2045 # Register the main calibration collection we'll be working with.
2046 collection = "Cam1/calibs/default"
2047 registry.registerCollection(collection, type=CollectionType.CALIBRATION)
2048 # Cannot associate into a calibration collection (no timespan).
2049 with self.assertRaises(CollectionTypeError):
2050 registry.associate(collection, [bias2a])
2051 # Certify 2a dataset with [t2, t4) validity.
2052 registry.certify(collection, [bias2a], Timespan(begin=t2, end=t4))
2053 # Test that we can query for this dataset via the new collection, both
2054 # on its own and with a RUN collection, as long as we don't try to join
2055 # in temporal dimensions or use findFirst=True.
2056 self.assertEqual(
2057 set(registry.queryDatasets("bias", findFirst=False, collections=collection)),
2058 {bias2a},
2059 )
2060 self.assertEqual(
2061 set(registry.queryDatasets("bias", findFirst=False, collections=[collection, "imported_r"])),
2062 {
2063 bias2a,
2064 bias2b,
2065 bias3b,
2066 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
2067 },
2068 )
2069 self.assertEqual(
2070 set(registry.queryDataIds("detector", datasets="bias", collections=collection)),
2071 {registry.expandDataId(instrument="Cam1", detector=2)},
2072 )
2073 self.assertEqual(
2074 set(registry.queryDataIds("detector", datasets="bias", collections=[collection, "imported_r"])),
2075 {
2076 registry.expandDataId(instrument="Cam1", detector=2),
2077 registry.expandDataId(instrument="Cam1", detector=3),
2078 registry.expandDataId(instrument="Cam1", detector=4),
2079 },
2080 )
2082 # We should not be able to certify 2b with anything overlapping that
2083 # window.
2084 with self.assertRaises(ConflictingDefinitionError):
2085 registry.certify(collection, [bias2b], Timespan(begin=None, end=t3))
2086 with self.assertRaises(ConflictingDefinitionError):
2087 registry.certify(collection, [bias2b], Timespan(begin=None, end=t5))
2088 with self.assertRaises(ConflictingDefinitionError):
2089 registry.certify(collection, [bias2b], Timespan(begin=t1, end=t3))
2090 with self.assertRaises(ConflictingDefinitionError):
2091 registry.certify(collection, [bias2b], Timespan(begin=t1, end=t5))
2092 with self.assertRaises(ConflictingDefinitionError):
2093 registry.certify(collection, [bias2b], Timespan(begin=t1, end=None))
2094 with self.assertRaises(ConflictingDefinitionError):
2095 registry.certify(collection, [bias2b], Timespan(begin=t2, end=t3))
2096 with self.assertRaises(ConflictingDefinitionError):
2097 registry.certify(collection, [bias2b], Timespan(begin=t2, end=t5))
2098 with self.assertRaises(ConflictingDefinitionError):
2099 registry.certify(collection, [bias2b], Timespan(begin=t2, end=None))
2100 # We should be able to certify 3a with a range overlapping that window,
2101 # because it's for a different detector.
2102 # We'll certify 3a over [t1, t3).
2103 registry.certify(collection, [bias3a], Timespan(begin=t1, end=t3))
2104 # Now we'll certify 2b and 3b together over [t4, ∞).
2105 registry.certify(collection, [bias2b, bias3b], Timespan(begin=t4, end=None))
2107 # Fetch all associations and check that they are what we expect.
2108 self.assertCountEqual(
2109 list(
2110 registry.queryDatasetAssociations(
2111 "bias",
2112 collections=[collection, "imported_g", "imported_r"],
2113 )
2114 ),
2115 [
2116 DatasetAssociation(
2117 ref=registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
2118 collection="imported_g",
2119 timespan=None,
2120 ),
2121 DatasetAssociation(
2122 ref=registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
2123 collection="imported_r",
2124 timespan=None,
2125 ),
2126 DatasetAssociation(ref=bias2a, collection="imported_g", timespan=None),
2127 DatasetAssociation(ref=bias3a, collection="imported_g", timespan=None),
2128 DatasetAssociation(ref=bias2b, collection="imported_r", timespan=None),
2129 DatasetAssociation(ref=bias3b, collection="imported_r", timespan=None),
2130 DatasetAssociation(ref=bias2a, collection=collection, timespan=Timespan(begin=t2, end=t4)),
2131 DatasetAssociation(ref=bias3a, collection=collection, timespan=Timespan(begin=t1, end=t3)),
2132 DatasetAssociation(ref=bias2b, collection=collection, timespan=Timespan(begin=t4, end=None)),
2133 DatasetAssociation(ref=bias3b, collection=collection, timespan=Timespan(begin=t4, end=None)),
2134 ],
2135 )
2137 class Ambiguous:
2138 """Tag class to denote lookups that should be ambiguous."""
2140 pass
2142 def assertLookup(
2143 detector: int, timespan: Timespan, expected: DatasetRef | type[Ambiguous] | None
2144 ) -> None:
2145 """Local function that asserts that a bias lookup returns the given
2146 expected result.
2147 """
2148 if expected is Ambiguous:
2149 with self.assertRaises((DatasetTypeError, LookupError)):
2150 registry.findDataset(
2151 "bias",
2152 collections=collection,
2153 instrument="Cam1",
2154 detector=detector,
2155 timespan=timespan,
2156 )
2157 else:
2158 self.assertEqual(
2159 expected,
2160 registry.findDataset(
2161 "bias",
2162 collections=collection,
2163 instrument="Cam1",
2164 detector=detector,
2165 timespan=timespan,
2166 ),
2167 )
2169 # Systematically test lookups against expected results.
2170 assertLookup(detector=2, timespan=Timespan(None, t1), expected=None)
2171 assertLookup(detector=2, timespan=Timespan(None, t2), expected=None)
2172 assertLookup(detector=2, timespan=Timespan(None, t3), expected=bias2a)
2173 assertLookup(detector=2, timespan=Timespan(None, t4), expected=bias2a)
2174 assertLookup(detector=2, timespan=Timespan(None, t5), expected=Ambiguous)
2175 assertLookup(detector=2, timespan=Timespan(None, None), expected=Ambiguous)
2176 assertLookup(detector=2, timespan=Timespan(t1, t2), expected=None)
2177 assertLookup(detector=2, timespan=Timespan(t1, t3), expected=bias2a)
2178 assertLookup(detector=2, timespan=Timespan(t1, t4), expected=bias2a)
2179 assertLookup(detector=2, timespan=Timespan(t1, t5), expected=Ambiguous)
2180 assertLookup(detector=2, timespan=Timespan(t1, None), expected=Ambiguous)
2181 assertLookup(detector=2, timespan=Timespan(t2, t3), expected=bias2a)
2182 assertLookup(detector=2, timespan=Timespan(t2, t4), expected=bias2a)
2183 assertLookup(detector=2, timespan=Timespan(t2, t5), expected=Ambiguous)
2184 assertLookup(detector=2, timespan=Timespan(t2, None), expected=Ambiguous)
2185 assertLookup(detector=2, timespan=Timespan(t3, t4), expected=bias2a)
2186 assertLookup(detector=2, timespan=Timespan(t3, t5), expected=Ambiguous)
2187 assertLookup(detector=2, timespan=Timespan(t3, None), expected=Ambiguous)
2188 assertLookup(detector=2, timespan=Timespan(t4, t5), expected=bias2b)
2189 assertLookup(detector=2, timespan=Timespan(t4, None), expected=bias2b)
2190 assertLookup(detector=2, timespan=Timespan(t5, None), expected=bias2b)
2191 assertLookup(detector=3, timespan=Timespan(None, t1), expected=None)
2192 assertLookup(detector=3, timespan=Timespan(None, t2), expected=bias3a)
2193 assertLookup(detector=3, timespan=Timespan(None, t3), expected=bias3a)
2194 assertLookup(detector=3, timespan=Timespan(None, t4), expected=bias3a)
2195 assertLookup(detector=3, timespan=Timespan(None, t5), expected=Ambiguous)
2196 assertLookup(detector=3, timespan=Timespan(None, None), expected=Ambiguous)
2197 assertLookup(detector=3, timespan=Timespan(t1, t2), expected=bias3a)
2198 assertLookup(detector=3, timespan=Timespan(t1, t3), expected=bias3a)
2199 assertLookup(detector=3, timespan=Timespan(t1, t4), expected=bias3a)
2200 assertLookup(detector=3, timespan=Timespan(t1, t5), expected=Ambiguous)
2201 assertLookup(detector=3, timespan=Timespan(t1, None), expected=Ambiguous)
2202 assertLookup(detector=3, timespan=Timespan(t2, t3), expected=bias3a)
2203 assertLookup(detector=3, timespan=Timespan(t2, t4), expected=bias3a)
2204 assertLookup(detector=3, timespan=Timespan(t2, t5), expected=Ambiguous)
2205 assertLookup(detector=3, timespan=Timespan(t2, None), expected=Ambiguous)
2206 assertLookup(detector=3, timespan=Timespan(t3, t4), expected=None)
2207 assertLookup(detector=3, timespan=Timespan(t3, t5), expected=bias3b)
2208 assertLookup(detector=3, timespan=Timespan(t3, None), expected=bias3b)
2209 assertLookup(detector=3, timespan=Timespan(t4, t5), expected=bias3b)
2210 assertLookup(detector=3, timespan=Timespan(t4, None), expected=bias3b)
2211 assertLookup(detector=3, timespan=Timespan(t5, None), expected=bias3b)
2213 # Decertify [t3, t5) for all data IDs, and do test lookups again.
2214 # This should truncate bias2a to [t2, t3), leave bias3a unchanged at
2215 # [t1, t3), and truncate bias2b and bias3b to [t5, ∞).
2216 registry.decertify(collection=collection, datasetType="bias", timespan=Timespan(t3, t5))
2217 assertLookup(detector=2, timespan=Timespan(None, t1), expected=None)
2218 assertLookup(detector=2, timespan=Timespan(None, t2), expected=None)
2219 assertLookup(detector=2, timespan=Timespan(None, t3), expected=bias2a)
2220 assertLookup(detector=2, timespan=Timespan(None, t4), expected=bias2a)
2221 assertLookup(detector=2, timespan=Timespan(None, t5), expected=bias2a)
2222 assertLookup(detector=2, timespan=Timespan(None, None), expected=Ambiguous)
2223 assertLookup(detector=2, timespan=Timespan(t1, t2), expected=None)
2224 assertLookup(detector=2, timespan=Timespan(t1, t3), expected=bias2a)
2225 assertLookup(detector=2, timespan=Timespan(t1, t4), expected=bias2a)
2226 assertLookup(detector=2, timespan=Timespan(t1, t5), expected=bias2a)
2227 assertLookup(detector=2, timespan=Timespan(t1, None), expected=Ambiguous)
2228 assertLookup(detector=2, timespan=Timespan(t2, t3), expected=bias2a)
2229 assertLookup(detector=2, timespan=Timespan(t2, t4), expected=bias2a)
2230 assertLookup(detector=2, timespan=Timespan(t2, t5), expected=bias2a)
2231 assertLookup(detector=2, timespan=Timespan(t2, None), expected=Ambiguous)
2232 assertLookup(detector=2, timespan=Timespan(t3, t4), expected=None)
2233 assertLookup(detector=2, timespan=Timespan(t3, t5), expected=None)
2234 assertLookup(detector=2, timespan=Timespan(t3, None), expected=bias2b)
2235 assertLookup(detector=2, timespan=Timespan(t4, t5), expected=None)
2236 assertLookup(detector=2, timespan=Timespan(t4, None), expected=bias2b)
2237 assertLookup(detector=2, timespan=Timespan(t5, None), expected=bias2b)
2238 assertLookup(detector=3, timespan=Timespan(None, t1), expected=None)
2239 assertLookup(detector=3, timespan=Timespan(None, t2), expected=bias3a)
2240 assertLookup(detector=3, timespan=Timespan(None, t3), expected=bias3a)
2241 assertLookup(detector=3, timespan=Timespan(None, t4), expected=bias3a)
2242 assertLookup(detector=3, timespan=Timespan(None, t5), expected=bias3a)
2243 assertLookup(detector=3, timespan=Timespan(None, None), expected=Ambiguous)
2244 assertLookup(detector=3, timespan=Timespan(t1, t2), expected=bias3a)
2245 assertLookup(detector=3, timespan=Timespan(t1, t3), expected=bias3a)
2246 assertLookup(detector=3, timespan=Timespan(t1, t4), expected=bias3a)
2247 assertLookup(detector=3, timespan=Timespan(t1, t5), expected=bias3a)
2248 assertLookup(detector=3, timespan=Timespan(t1, None), expected=Ambiguous)
2249 assertLookup(detector=3, timespan=Timespan(t2, t3), expected=bias3a)
2250 assertLookup(detector=3, timespan=Timespan(t2, t4), expected=bias3a)
2251 assertLookup(detector=3, timespan=Timespan(t2, t5), expected=bias3a)
2252 assertLookup(detector=3, timespan=Timespan(t2, None), expected=Ambiguous)
2253 assertLookup(detector=3, timespan=Timespan(t3, t4), expected=None)
2254 assertLookup(detector=3, timespan=Timespan(t3, t5), expected=None)
2255 assertLookup(detector=3, timespan=Timespan(t3, None), expected=bias3b)
2256 assertLookup(detector=3, timespan=Timespan(t4, t5), expected=None)
2257 assertLookup(detector=3, timespan=Timespan(t4, None), expected=bias3b)
2258 assertLookup(detector=3, timespan=Timespan(t5, None), expected=bias3b)
2260 # Decertify everything, this time with explicit data IDs, then check
2261 # that no lookups succeed.
2262 registry.decertify(
2263 collection,
2264 "bias",
2265 Timespan(None, None),
2266 dataIds=[
2267 dict(instrument="Cam1", detector=2),
2268 dict(instrument="Cam1", detector=3),
2269 ],
2270 )
2271 for detector in (2, 3):
2272 for timespan in allTimespans:
2273 assertLookup(detector=detector, timespan=timespan, expected=None)
2274 # Certify bias2a and bias3a over (-∞, ∞), check that all lookups return
2275 # those.
2276 registry.certify(
2277 collection,
2278 [bias2a, bias3a],
2279 Timespan(None, None),
2280 )
2281 for timespan in allTimespans:
2282 assertLookup(detector=2, timespan=timespan, expected=bias2a)
2283 assertLookup(detector=3, timespan=timespan, expected=bias3a)
2284 # Decertify just bias2 over [t2, t4).
2285 # This should split a single certification row into two (and leave the
2286 # other existing row, for bias3a, alone).
2287 registry.decertify(
2288 collection, "bias", Timespan(t2, t4), dataIds=[dict(instrument="Cam1", detector=2)]
2289 )
2290 for timespan in allTimespans:
2291 assertLookup(detector=3, timespan=timespan, expected=bias3a)
2292 overlapsBefore = timespan.overlaps(Timespan(None, t2))
2293 overlapsAfter = timespan.overlaps(Timespan(t4, None))
2294 if overlapsBefore and overlapsAfter:
2295 expected = Ambiguous
2296 elif overlapsBefore or overlapsAfter:
2297 expected = bias2a
2298 else:
2299 expected = None
2300 assertLookup(detector=2, timespan=timespan, expected=expected)
2302 def testSkipCalibs(self):
2303 """Test how queries handle skipping of calibration collections."""
2304 registry = self.makeRegistry()
2305 self.loadData(registry, "base.yaml")
2306 self.loadData(registry, "datasets.yaml")
2308 coll_calib = "Cam1/calibs/default"
2309 registry.registerCollection(coll_calib, type=CollectionType.CALIBRATION)
2311 # Add all biases to the calibration collection.
2312 # Without this, the logic that prunes dataset subqueries based on
2313 # datasetType-collection summary information will fire before the logic
2314 # we want to test below. This is a good thing (it avoids the dreaded
2315 # NotImplementedError a bit more often) everywhere but here.
2316 registry.certify(coll_calib, registry.queryDatasets("bias", collections=...), Timespan(None, None))
2318 coll_list = [coll_calib, "imported_g", "imported_r"]
2319 chain = "Cam1/chain"
2320 registry.registerCollection(chain, type=CollectionType.CHAINED)
2321 registry.setCollectionChain(chain, coll_list)
2323 # explicit list will raise if findFirst=True or there are temporal
2324 # dimensions
2325 with self.assertRaises(NotImplementedError):
2326 registry.queryDatasets("bias", collections=coll_list, findFirst=True)
2327 with self.assertRaises(NotImplementedError):
2328 registry.queryDataIds(
2329 ["instrument", "detector", "exposure"], datasets="bias", collections=coll_list
2330 ).count()
2332 # chain will skip
2333 datasets = list(registry.queryDatasets("bias", collections=chain))
2334 self.assertGreater(len(datasets), 0)
2336 dataIds = list(registry.queryDataIds(["instrument", "detector"], datasets="bias", collections=chain))
2337 self.assertGreater(len(dataIds), 0)
2339 # glob will skip too
2340 datasets = list(registry.queryDatasets("bias", collections="*d*"))
2341 self.assertGreater(len(datasets), 0)
2343 # regular expression will skip too
2344 pattern = re.compile(".*")
2345 datasets = list(registry.queryDatasets("bias", collections=pattern))
2346 self.assertGreater(len(datasets), 0)
2348 # ellipsis should work as usual
2349 datasets = list(registry.queryDatasets("bias", collections=...))
2350 self.assertGreater(len(datasets), 0)
2352 # A few tests with findFirst.
2353 datasets = list(registry.queryDatasets("bias", collections=chain, findFirst=True))
2354 self.assertGreater(len(datasets), 0)
2356 def testIngestTimeQuery(self):
2357 registry = self.makeRegistry()
2358 self.loadData(registry, "base.yaml")
2359 dt0 = datetime.utcnow()
2360 self.loadData(registry, "datasets.yaml")
2361 dt1 = datetime.utcnow()
2363 datasets = list(registry.queryDatasets(..., collections=...))
2364 len0 = len(datasets)
2365 self.assertGreater(len0, 0)
2367 where = "ingest_date > T'2000-01-01'"
2368 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2369 len1 = len(datasets)
2370 self.assertEqual(len0, len1)
2372 # no one will ever use this piece of software in 30 years
2373 where = "ingest_date > T'2050-01-01'"
2374 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2375 len2 = len(datasets)
2376 self.assertEqual(len2, 0)
2378 # Check more exact timing to make sure there is no 37-second offset
2379 # (after fixing DM-30124). SQLite time precision is 1 second, so make
2380 # sure that we don't test with higher precision.
2381 tests = [
2382 # format: (timestamp, operator, expected_len)
2383 (dt0 - timedelta(seconds=1), ">", len0),
2384 (dt0 - timedelta(seconds=1), "<", 0),
2385 (dt1 + timedelta(seconds=1), "<", len0),
2386 (dt1 + timedelta(seconds=1), ">", 0),
2387 ]
2388 for dt, op, expect_len in tests:
2389 dt_str = dt.isoformat(sep=" ")
2391 where = f"ingest_date {op} T'{dt_str}'"
2392 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2393 self.assertEqual(len(datasets), expect_len)
2395 # same with bind using datetime or astropy Time
2396 where = f"ingest_date {op} ingest_time"
2397 datasets = list(
2398 registry.queryDatasets(..., collections=..., where=where, bind={"ingest_time": dt})
2399 )
2400 self.assertEqual(len(datasets), expect_len)
2402 dt_astropy = astropy.time.Time(dt, format="datetime")
2403 datasets = list(
2404 registry.queryDatasets(..., collections=..., where=where, bind={"ingest_time": dt_astropy})
2405 )
2406 self.assertEqual(len(datasets), expect_len)
2408 def testTimespanQueries(self):
2409 """Test query expressions involving timespans."""
2410 registry = self.makeRegistry()
2411 self.loadData(registry, "hsc-rc2-subset.yaml")
2412 # All visits in the database; mapping from ID to timespan.
2413 visits = {record.id: record.timespan for record in registry.queryDimensionRecords("visit")}
2414 # Just those IDs, sorted (which is also temporal sorting, because HSC
2415 # visit IDs are monotonically increasing).
2416 ids = sorted(visits.keys())
2417 self.assertGreater(len(ids), 20)
2418 # Pick some quasi-random indexes into `ids` to play with.
2419 i1 = int(len(ids) * 0.1)
2420 i2 = int(len(ids) * 0.3)
2421 i3 = int(len(ids) * 0.6)
2422 i4 = int(len(ids) * 0.8)
2423 # Extract some times from those: just before the beginning of i1 (which
2424 # should be after the end of the previous visit), exactly the
2425 # beginning of i2, just after the beginning of i3 (and before its end),
2426 # and the exact end of i4.
2427 t1 = visits[ids[i1]].begin - astropy.time.TimeDelta(1.0, format="sec")
2428 self.assertGreater(t1, visits[ids[i1 - 1]].end)
2429 t2 = visits[ids[i2]].begin
2430 t3 = visits[ids[i3]].begin + astropy.time.TimeDelta(1.0, format="sec")
2431 self.assertLess(t3, visits[ids[i3]].end)
2432 t4 = visits[ids[i4]].end
2433 # Make sure those are actually in order.
2434 self.assertEqual([t1, t2, t3, t4], sorted([t4, t3, t2, t1]))
2436 bind = {
2437 "t1": t1,
2438 "t2": t2,
2439 "t3": t3,
2440 "t4": t4,
2441 "ts23": Timespan(t2, t3),
2442 }
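# These bind names (t1..t4, ts23) can be referenced as identifiers in the
# ``where`` strings below instead of embedding literal timestamps.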
2444 def query(where):
2445 """Helper function that queries for visit data IDs and returns
2446 results as a sorted, deduplicated list of visit IDs.
2447 """
2448 return sorted(
2449 {
2450 dataId["visit"]
2451 for dataId in registry.queryDataIds("visit", instrument="HSC", bind=bind, where=where)
2452 }
2453 )
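# e.g. query("visit.timespan OVERLAPS ts23") returns the sorted IDs of
# the visits whose timespans overlap [t2, t3).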
2455 # Try a bunch of timespan queries, mixing up the bounds themselves,
2456 # where they appear in the expression, and how we get the timespan into
2457 # the expression.
2459 # t1 is before the start of i1, so this should not include i1.
2460 self.assertEqual(ids[:i1], query("visit.timespan OVERLAPS (null, t1)"))
2461 # t2 is exactly at the start of i2, but ends are exclusive, so these
2462 # should not include i2.
2463 self.assertEqual(ids[i1:i2], query("(t1, t2) OVERLAPS visit.timespan"))
2464 self.assertEqual(ids[:i2], query("visit.timespan < (t2, t4)"))
2465 # t3 is in the middle of i3, so this should include i3.
2466 self.assertEqual(ids[i2 : i3 + 1], query("visit.timespan OVERLAPS ts23"))
2467 # This one should not include i3, by the same reasoning.
2468 self.assertEqual(ids[i3 + 1 :], query("visit.timespan > (t1, t3)"))
2469 # t4 is exactly at the end of i4, so this should include i4.
2470 self.assertEqual(ids[i3 : i4 + 1], query(f"visit.timespan OVERLAPS (T'{t3.tai.isot}', t4)"))
2471 # i4's upper bound of t4 is exclusive, so this should not include i4.
2472 self.assertEqual(ids[i4 + 1 :], query("visit.timespan OVERLAPS (t4, NULL)"))
2474 # Now some timespan vs. time scalar queries.
2475 self.assertEqual(ids[:i2], query("visit.timespan < t2"))
2476 self.assertEqual(ids[:i2], query("t2 > visit.timespan"))
2477 self.assertEqual(ids[i3 + 1 :], query("visit.timespan > t3"))
2478 self.assertEqual(ids[i3 + 1 :], query("t3 < visit.timespan"))
2479 self.assertEqual(ids[i3 : i3 + 1], query("visit.timespan OVERLAPS t3"))
2480 self.assertEqual(ids[i3 : i3 + 1], query(f"T'{t3.tai.isot}' OVERLAPS visit.timespan"))
2482 # Empty timespans should not overlap anything.
2483 self.assertEqual([], query("visit.timespan OVERLAPS (t3, t2)"))
2485 def testCollectionSummaries(self):
2486 """Test recording and retrieval of collection summaries."""
2487 self.maxDiff = None
2488 registry = self.makeRegistry()
2489 # Importing datasets from yaml should go through the code path where
2490 # we update collection summaries as we insert datasets.
2491 self.loadData(registry, "base.yaml")
2492 self.loadData(registry, "datasets.yaml")
2493 flat = registry.getDatasetType("flat")
2494 expected1 = CollectionSummary()
2495 expected1.dataset_types.add(registry.getDatasetType("bias"))
2496 expected1.add_data_ids(
2497 flat, [DataCoordinate.standardize(instrument="Cam1", universe=registry.dimensions)]
2498 )
2499 self.assertEqual(registry.getCollectionSummary("imported_g"), expected1)
2500 self.assertEqual(registry.getCollectionSummary("imported_r"), expected1)
2501 # Create a chained collection with both of the imported runs; the
2502 # summary should be the same, because it's a union with itself.
2503 chain = "chain"
2504 registry.registerCollection(chain, CollectionType.CHAINED)
2505 registry.setCollectionChain(chain, ["imported_r", "imported_g"])
2506 self.assertEqual(registry.getCollectionSummary(chain), expected1)
2507 # Associate flats only into a tagged collection and a calibration
2508 # collection to check summaries of those.
2509 tag = "tag"
2510 registry.registerCollection(tag, CollectionType.TAGGED)
2511 registry.associate(tag, registry.queryDatasets(flat, collections="imported_g"))
2512 calibs = "calibs"
2513 registry.registerCollection(calibs, CollectionType.CALIBRATION)
2514 registry.certify(
2515 calibs, registry.queryDatasets(flat, collections="imported_g"), timespan=Timespan(None, None)
2516 )
2517 expected2 = expected1.copy()
2518 expected2.dataset_types.discard("bias")
2519 self.assertEqual(registry.getCollectionSummary(tag), expected2)
2520 self.assertEqual(registry.getCollectionSummary(calibs), expected2)
2521 # Explicitly calling Registry.refresh() should load those same
2522 # summaries, via a totally different code path.
2523 registry.refresh()
2524 self.assertEqual(registry.getCollectionSummary("imported_g"), expected1)
2525 self.assertEqual(registry.getCollectionSummary("imported_r"), expected1)
2526 self.assertEqual(registry.getCollectionSummary(tag), expected2)
2527 self.assertEqual(registry.getCollectionSummary(calibs), expected2)
2529 def testBindInQueryDatasets(self):
2530 """Test that the bind parameter is correctly forwarded in
2531 queryDatasets recursion.
2532 """
2533 registry = self.makeRegistry()
2534 # Importing datasets from yaml should go through the code path where
2535 # we update collection summaries as we insert datasets.
2536 self.loadData(registry, "base.yaml")
2537 self.loadData(registry, "datasets.yaml")
2538 self.assertEqual(
2539 set(registry.queryDatasets("flat", band="r", collections=...)),
2540 set(registry.queryDatasets("flat", where="band=my_band", bind={"my_band": "r"}, collections=...)),
2541 )
2543 def testQueryIntRangeExpressions(self):
2544 """Test integer range expressions in ``where`` arguments.
2546 Note that our expressions use inclusive stop values, unlike Python's.
2547 """
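# For example, "detector IN (1..4:2)" uses start..stop:stride with an
# inclusive stop, so it matches detectors 1 and 3.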
2547 """
2548 registry = self.makeRegistry()
2549 self.loadData(registry, "base.yaml")
2550 self.assertEqual(
2551 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (1..2)")),
2552 {registry.expandDataId(instrument="Cam1", detector=n) for n in [1, 2]},
2553 )
2554 self.assertEqual(
2555 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (1..4:2)")),
2556 {registry.expandDataId(instrument="Cam1", detector=n) for n in [1, 3]},
2557 )
2558 self.assertEqual(
2559 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (2..4:2)")),
2560 {registry.expandDataId(instrument="Cam1", detector=n) for n in [2, 4]},
2561 )
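    # A sketch of the correspondence (hypothetical helper): a range
    # expression ``(a..b:s)`` matches the values of Python's
    # ``range(a, b + 1, s)``, because the stop value is inclusive.
    def _example_range_equivalence(self) -> None:
        assert list(range(1, 2 + 1)) == [1, 2]  # detector IN (1..2)
        assert list(range(1, 4 + 1, 2)) == [1, 3]  # detector IN (1..4:2)
        assert list(range(2, 4 + 1, 2)) == [2, 4]  # detector IN (2..4:2)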
2563 def testQueryResultSummaries(self):
2564 """Test summary methods like `count`, `any`, and `explain_no_results`
2565 on `DataCoordinateQueryResults` and `DatasetQueryResults`.
2566 """
2567 registry = self.makeRegistry()
2568 self.loadData(registry, "base.yaml")
2569 self.loadData(registry, "datasets.yaml")
2570 self.loadData(registry, "spatial.yaml")
2571 # Default test dataset has two collections, each with both flats and
2572 # biases. Add a new collection with only biases.
2573 registry.registerCollection("biases", CollectionType.TAGGED)
2574 registry.associate("biases", registry.queryDatasets("bias", collections=["imported_g"]))
2575 # First query yields two results, and involves no postprocessing.
2576 query1 = registry.queryDataIds(["physical_filter"], band="r")
2577 self.assertTrue(query1.any(execute=False, exact=False))
2578 self.assertTrue(query1.any(execute=True, exact=False))
2579 self.assertTrue(query1.any(execute=True, exact=True))
2580 self.assertEqual(query1.count(exact=False), 2)
2581 self.assertEqual(query1.count(exact=True), 2)
2582 self.assertFalse(list(query1.explain_no_results()))
2583 # Second query should yield no results, which we should see when
2584 # we attempt to expand the data ID.
2585 query2 = registry.queryDataIds(["physical_filter"], band="h")
2586 # There's no execute=False, exact=False test here because the behavior
2587 # is not something we want to guarantee in this case (and exact=False
2588 # says either answer is legal).
2589 self.assertFalse(query2.any(execute=True, exact=False))
2590 self.assertFalse(query2.any(execute=True, exact=True))
2591 self.assertEqual(query2.count(exact=False), 0)
2592 self.assertEqual(query2.count(exact=True), 0)
2593 self.assertTrue(list(query2.explain_no_results()))
2594 # These queries yield no results due to various problems that can be
2595 # spotted prior to execution, yielding helpful diagnostics.
2596 base_query = registry.queryDataIds(["detector", "physical_filter"])
2597 queries_and_snippets = [
2598 (
2599 # Dataset type name doesn't match any existing dataset types.
2600 registry.queryDatasets("nonexistent", collections=...),
2601 ["nonexistent"],
2602 ),
2603 (
2604 # Dataset type object isn't registered.
2605 registry.queryDatasets(
2606 DatasetType(
2607 "nonexistent",
2608 dimensions=["instrument"],
2609 universe=registry.dimensions,
2610 storageClass="Image",
2611 ),
2612 collections=...,
2613 ),
2614 ["nonexistent"],
2615 ),
2616 (
2617 # No datasets of this type in this collection.
2618 registry.queryDatasets("flat", collections=["biases"]),
2619 ["flat", "biases"],
2620 ),
2621 (
2622 # No datasets of this type in this collection.
2623 base_query.findDatasets("flat", collections=["biases"]),
2624 ["flat", "biases"],
2625 ),
2626 (
2627 # No collections matching at all.
2628 registry.queryDatasets("flat", collections=re.compile("potato.+")),
2629 ["potato"],
2630 ),
2631 ]
2632 # The behavior of these additional queries is slated to change in the
2633 # future, so we also check for deprecation warnings.
2634 with self.assertWarns(FutureWarning):
2635 queries_and_snippets.append(
2636 (
2637 # Dataset type name doesn't match any existing dataset
2638 # types.
2639 registry.queryDataIds(["detector"], datasets=["nonexistent"], collections=...),
2640 ["nonexistent"],
2641 )
2642 )
2643 with self.assertWarns(FutureWarning):
2644 queries_and_snippets.append(
2645 (
2646 # Dataset type name doesn't match any existing dataset
2647 # types.
2648 registry.queryDimensionRecords("detector", datasets=["nonexistent"], collections=...),
2649 ["nonexistent"],
2650 )
2651 )
2652 for query, snippets in queries_and_snippets:
2653 self.assertFalse(query.any(execute=False, exact=False))
2654 self.assertFalse(query.any(execute=True, exact=False))
2655 self.assertFalse(query.any(execute=True, exact=True))
2656 self.assertEqual(query.count(exact=False), 0)
2657 self.assertEqual(query.count(exact=True), 0)
2658 messages = list(query.explain_no_results())
2659 self.assertTrue(messages)
2660 # Want all expected snippets to appear in at least one message.
2661 self.assertTrue(
2662 any(
2663 all(snippet in message for snippet in snippets) for message in query.explain_no_results()
2664 ),
2665 messages,
2666 )
2668 # This query does yield results, but it should also emit a warning
2669 # because passing dataset type patterns to queryDataIds is deprecated;
2670 # just look for the warning.
2671 with self.assertWarns(FutureWarning):
2672 registry.queryDataIds(["detector"], datasets=re.compile("^nonexistent$"), collections=...)
2674 # These queries yield no results due to problems that can be identified
2675 # by cheap follow-up queries, yielding helpful diagnostics.
2676 for query, snippets in [
2677 (
2678 # No records for one of the involved dimensions.
2679 registry.queryDataIds(["subfilter"]),
2680 ["no rows", "subfilter"],
2681 ),
2682 (
2683 # No records for one of the involved dimensions.
2684 registry.queryDimensionRecords("subfilter"),
2685 ["no rows", "subfilter"],
2686 ),
2687 ]:
2688 self.assertFalse(query.any(execute=True, exact=False))
2689 self.assertFalse(query.any(execute=True, exact=True))
2690 self.assertEqual(query.count(exact=True), 0)
2691 messages = list(query.explain_no_results())
2692 self.assertTrue(messages)
2693 # Want all expected snippets to appear in at least one message.
2694 self.assertTrue(
2695 any(
2696 all(snippet in message for snippet in snippets) for message in query.explain_no_results()
2697 ),
2698 messages,
2699 )
2701 # This query yields four overlaps in the database, but one is filtered
2702 # out in postprocessing. The count queries aren't accurate because
2703 # they don't account for duplication that happens due to an internal
2704 # join against commonSkyPix.
2705 query3 = registry.queryDataIds(["visit", "tract"], instrument="Cam1", skymap="SkyMap1")
2706 self.assertEqual(
2707 {
2708 DataCoordinate.standardize(
2709 instrument="Cam1",
2710 skymap="SkyMap1",
2711 visit=v,
2712 tract=t,
2713 universe=registry.dimensions,
2714 )
2715 for v, t in [(1, 0), (2, 0), (2, 1)]
2716 },
2717 set(query3),
2718 )
2719 self.assertTrue(query3.any(execute=False, exact=False))
2720 self.assertTrue(query3.any(execute=True, exact=False))
2721 self.assertTrue(query3.any(execute=True, exact=True))
2722 self.assertGreaterEqual(query3.count(exact=False), 4)
2723 self.assertGreaterEqual(query3.count(exact=True, discard=True), 3)
2724 self.assertFalse(list(query3.explain_no_results()))
2725 # This query yields overlaps in the database, but all are filtered
2726 # out in postprocessing. The count queries again aren't very useful.
2727 # We have to use `where=` here to avoid an optimization that
2728 # (currently) skips the spatial postprocess-filtering because it
2729 # recognizes that no spatial join is necessary. That's not ideal, but
2730 # fixing it is out of scope for this ticket.
2731 query4 = registry.queryDataIds(
2732 ["visit", "tract"],
2733 instrument="Cam1",
2734 skymap="SkyMap1",
2735 where="visit=1 AND detector=1 AND tract=0 AND patch=4",
2736 )
2737 self.assertFalse(set(query4))
2738 self.assertTrue(query4.any(execute=False, exact=False))
2739 self.assertTrue(query4.any(execute=True, exact=False))
2740 self.assertFalse(query4.any(execute=True, exact=True))
2741 self.assertGreaterEqual(query4.count(exact=False), 1)
2742 self.assertEqual(query4.count(exact=True, discard=True), 0)
2743 messages = query4.explain_no_results()
2744 self.assertTrue(messages)
2745 self.assertTrue(any("overlap" in message for message in messages))
2746 # This query should yield results from one dataset type but not the
2747 # other, which is not registered.
2748 query5 = registry.queryDatasets(["bias", "nonexistent"], collections=["biases"])
2749 self.assertTrue(set(query5))
2750 self.assertTrue(query5.any(execute=False, exact=False))
2751 self.assertTrue(query5.any(execute=True, exact=False))
2752 self.assertTrue(query5.any(execute=True, exact=True))
2753 self.assertGreaterEqual(query5.count(exact=False), 1)
2754 self.assertGreaterEqual(query5.count(exact=True), 1)
2755 self.assertFalse(list(query5.explain_no_results()))
2756 # This query applies a selection that yields no results, fully in the
2757 # database. Explaining why it is empty involves traversing the relation
2758 # tree and running a LIMIT 1 query at each level that has the potential
2759 # to remove rows.
2760 query6 = registry.queryDimensionRecords(
2761 "detector", where="detector.purpose = 'no-purpose'", instrument="Cam1"
2762 )
2763 self.assertEqual(query6.count(exact=True), 0)
2764 messages = query6.explain_no_results()
2765 self.assertTrue(messages)
2766 self.assertTrue(any("no-purpose" in message for message in messages))
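    # A minimal triage sketch (hypothetical helper) using only the methods
    # exercised above: ``any``/``count`` with exact=False give cheap upper
    # bounds, exact=True gives definitive answers, and explain_no_results
    # reports why an empty result is empty.
    def _example_result_triage(self, registry: Registry) -> None:
        query = registry.queryDataIds(["physical_filter"], band="h")
        if not query.any(execute=True, exact=True):
            for message in query.explain_no_results():
                print(message)  # e.g. which constraint removed all rows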
2768 def testQueryDataIdsOrderBy(self):
2769 """Test order_by and limit on result returned by queryDataIds()."""
2770 registry = self.makeRegistry()
2771 self.loadData(registry, "base.yaml")
2772 self.loadData(registry, "datasets.yaml")
2773 self.loadData(registry, "spatial.yaml")
2775 def do_query(dimensions=("visit", "tract"), datasets=None, collections=None):
2776 return registry.queryDataIds(
2777 dimensions, datasets=datasets, collections=collections, instrument="Cam1", skymap="SkyMap1"
2778 )
2780 Test = namedtuple(
2781 "testQueryDataIdsOrderByTest",
2782 ("order_by", "keys", "result", "limit", "datasets", "collections"),
2783 defaults=(None, None, None),
2784 )
2786 test_data = (
2787 Test("tract,visit", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2788 Test("-tract,visit", "tract,visit", ((1, 2), (1, 2), (0, 1), (0, 1), (0, 2), (0, 2))),
2789 Test("tract,-visit", "tract,visit", ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2))),
2790 Test("-tract,-visit", "tract,visit", ((1, 2), (1, 2), (0, 2), (0, 2), (0, 1), (0, 1))),
2791 Test(
2792 "tract.id,visit.id",
2793 "tract,visit",
2794 ((0, 1), (0, 1), (0, 2)),
2795 limit=(3,),
2796 ),
2797 Test("-tract,-visit", "tract,visit", ((1, 2), (1, 2), (0, 2)), limit=(3,)),
2798 Test("tract,visit", "tract,visit", ((0, 2), (1, 2), (1, 2)), limit=(3, 3)),
2799 Test("-tract,-visit", "tract,visit", ((0, 1),), limit=(3, 5)),
2800 Test(
2801 "tract,visit.exposure_time", "tract,visit", ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2))
2802 ),
2803 Test(
2804 "-tract,-visit.exposure_time", "tract,visit", ((1, 2), (1, 2), (0, 1), (0, 1), (0, 2), (0, 2))
2805 ),
2806 Test("tract,-exposure_time", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2807 Test("tract,visit.name", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2808 Test(
2809 "tract,-timespan.begin,timespan.end",
2810 "tract,visit",
2811 ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2)),
2812 ),
2813 Test("visit.day_obs,exposure.day_obs", "visit,exposure", ()),
2814 Test("visit.timespan.begin,-exposure.timespan.begin", "visit,exposure", ()),
2815 Test(
2816 "tract,detector",
2817 "tract,detector",
2818 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2819 datasets="flat",
2820 collections="imported_r",
2821 ),
2822 Test(
2823 "tract,detector.full_name",
2824 "tract,detector",
2825 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2826 datasets="flat",
2827 collections="imported_r",
2828 ),
2829 Test(
2830 "tract,detector.raft,detector.name_in_raft",
2831 "tract,detector",
2832 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2833 datasets="flat",
2834 collections="imported_r",
2835 ),
2836 )
2838 for test in test_data:
2839 order_by = test.order_by.split(",")
2840 keys = test.keys.split(",")
2841 query = do_query(keys, test.datasets, test.collections).order_by(*order_by)
2842 if test.limit is not None:
2843 query = query.limit(*test.limit)
2844 dataIds = tuple(tuple(dataId[k] for k in keys) for dataId in query)
2845 self.assertEqual(dataIds, test.result)
2847 # Ordered queries cannot be materialized; this should raise.
2848 query = do_query(keys).order_by(*order_by)
2849 if test.limit is not None:
2850 query = query.limit(*test.limit)
2851 with self.assertRaises(RelationalAlgebraError):
2852 with query.materialize():
2853 pass
2855 # Errors in an order_by name should raise with a helpful message.
2856 for order_by in ("", "-"):
2857 with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"):
2858 list(do_query().order_by(order_by))
2860 for order_by in ("undimension.name", "-undimension.name"):
2861 with self.assertRaisesRegex(ValueError, "Unknown dimension element name 'undimension'"):
2862 list(do_query().order_by(order_by))
2864 for order_by in ("attract", "-attract"):
2865 with self.assertRaisesRegex(ValueError, "Metadata 'attract' cannot be found in any dimension"):
2866 list(do_query().order_by(order_by))
2868 with self.assertRaisesRegex(ValueError, "Metadata 'exposure_time' exists in more than one dimension"):
2869 list(do_query(("exposure", "visit")).order_by("exposure_time"))
2871 with self.assertRaisesRegex(ValueError, "Timespan exists in more than one dimesion"):
2872 list(do_query(("exposure", "visit")).order_by("timespan.begin"))
2874 with self.assertRaisesRegex(
2875 ValueError, "Cannot find any temporal dimension element for 'timespan.begin'"
2876 ):
2877 list(do_query("tract").order_by("timespan.begin"))
2879 with self.assertRaisesRegex(ValueError, "Cannot use 'timespan.begin' with non-temporal element"):
2880 list(do_query("tract").order_by("tract.timespan.begin"))
2882 with self.assertRaisesRegex(ValueError, "Field 'name' does not exist in 'tract'."):
2883 list(do_query("tract").order_by("tract.name"))
2885 def testQueryDataIdsGovernorExceptions(self):
2886 """Test exceptions raised by queryDataIds() for incorrect governors."""
2887 registry = self.makeRegistry()
2888 self.loadData(registry, "base.yaml")
2889 self.loadData(registry, "datasets.yaml")
2890 self.loadData(registry, "spatial.yaml")
2892 def do_query(dimensions, dataId=None, where="", bind=None, **kwargs):
2893 return registry.queryDataIds(dimensions, dataId=dataId, where=where, bind=bind, **kwargs)
2895 Test = namedtuple(
2896 "testQueryDataIdExceptionsTest",
2897 ("dimensions", "dataId", "where", "bind", "kwargs", "exception", "count"),
2898 defaults=(None, None, None, {}, None, 0),
2899 )
2901 test_data = (
2902 Test("tract,visit", count=6),
2903 Test("tract,visit", kwargs={"instrument": "Cam1", "skymap": "SkyMap1"}, count=6),
2904 Test(
2905 "tract,visit", kwargs={"instrument": "Cam2", "skymap": "SkyMap1"}, exception=DataIdValueError
2906 ),
2907 Test("tract,visit", dataId={"instrument": "Cam1", "skymap": "SkyMap1"}, count=6),
2908 Test(
2909 "tract,visit", dataId={"instrument": "Cam1", "skymap": "SkyMap2"}, exception=DataIdValueError
2910 ),
2911 Test("tract,visit", where="instrument='Cam1' AND skymap='SkyMap1'", count=6),
2912 Test("tract,visit", where="instrument='Cam1' AND skymap='SkyMap5'", exception=DataIdValueError),
2913 Test(
2914 "tract,visit",
2915 where="instrument=cam AND skymap=map",
2916 bind={"cam": "Cam1", "map": "SkyMap1"},
2917 count=6,
2918 ),
2919 Test(
2920 "tract,visit",
2921 where="instrument=cam AND skymap=map",
2922 bind={"cam": "Cam", "map": "SkyMap"},
2923 exception=DataIdValueError,
2924 ),
2925 )
2927 for test in test_data:
2928 dimensions = test.dimensions.split(",")
2929 if test.exception:
2930 with self.assertRaises(test.exception):
2931 do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs).count()
2932 else:
2933 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2934 self.assertEqual(query.count(discard=True), test.count)
2936 # Repeat the same checks on materialized results.
2937 if test.exception:
2938 with self.assertRaises(test.exception):
2939 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2940 with query.materialize() as materialized:
2941 materialized.count(discard=True)
2942 else:
2943 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2944 with query.materialize() as materialized:
2945 self.assertEqual(materialized.count(discard=True), test.count)
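    # A minimal sketch (hypothetical helper): unknown governor-dimension
    # values raise DataIdValueError rather than silently yielding an empty
    # result, letting callers distinguish typos from genuinely empty data.
    def _example_governor_check(self, registry: Registry) -> None:
        try:
            registry.queryDataIds(
                ["tract", "visit"], instrument="Cam2", skymap="SkyMap1"
            ).count()
        except DataIdValueError:
            pass  # "Cam2" is not a known instrument in this test data.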
2947 def testQueryDimensionRecordsOrderBy(self):
2948 """Test order_by and limit on result returned by
2949 queryDimensionRecords().
2950 """
2951 registry = self.makeRegistry()
2952 self.loadData(registry, "base.yaml")
2953 self.loadData(registry, "datasets.yaml")
2954 self.loadData(registry, "spatial.yaml")
2956 def do_query(element, datasets=None, collections=None):
2957 return registry.queryDimensionRecords(
2958 element, instrument="Cam1", datasets=datasets, collections=collections
2959 )
2961 query = do_query("detector")
2962 self.assertEqual(len(list(query)), 4)
2964 Test = namedtuple(
2965 "testQueryDataIdsOrderByTest",
2966 ("element", "order_by", "result", "limit", "datasets", "collections"),
2967 defaults=(None, None, None),
2968 )
2970 test_data = (
2971 Test("detector", "detector", (1, 2, 3, 4)),
2972 Test("detector", "-detector", (4, 3, 2, 1)),
2973 Test("detector", "raft,-name_in_raft", (2, 1, 4, 3)),
2974 Test("detector", "-detector.purpose", (4,), limit=(1,)),
2975 Test("detector", "-purpose,detector.raft,name_in_raft", (2, 3), limit=(2, 2)),
2976 Test("visit", "visit", (1, 2)),
2977 Test("visit", "-visit.id", (2, 1)),
2978 Test("visit", "zenith_angle", (1, 2)),
2979 Test("visit", "-visit.name", (2, 1)),
2980 Test("visit", "day_obs,-timespan.begin", (2, 1)),
2981 )
2983 for test in test_data:
2984 order_by = test.order_by.split(",")
2985 query = do_query(test.element).order_by(*order_by)
2986 if test.limit is not None:
2987 query = query.limit(*test.limit)
2988 dataIds = tuple(rec.id for rec in query)
2989 self.assertEqual(dataIds, test.result)
2991 # Errors in an order_by name should raise with a helpful message.
2992 for order_by in ("", "-"):
2993 with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"):
2994 list(do_query("detector").order_by(order_by))
2996 for order_by in ("undimension.name", "-undimension.name"):
2997 with self.assertRaisesRegex(ValueError, "Element name mismatch: 'undimension'"):
2998 list(do_query("detector").order_by(order_by))
3000 for order_by in ("attract", "-attract"):
3001 with self.assertRaisesRegex(ValueError, "Field 'attract' does not exist in 'detector'."):
3002 list(do_query("detector").order_by(order_by))
3004 def testQueryDimensionRecordsExceptions(self):
3005 """Test exceptions raised by queryDimensionRecords()."""
3006 registry = self.makeRegistry()
3007 self.loadData(registry, "base.yaml")
3008 self.loadData(registry, "datasets.yaml")
3009 self.loadData(registry, "spatial.yaml")
3011 result = registry.queryDimensionRecords("detector")
3012 self.assertEqual(result.count(), 4)
3013 result = registry.queryDimensionRecords("detector", instrument="Cam1")
3014 self.assertEqual(result.count(), 4)
3015 result = registry.queryDimensionRecords("detector", dataId={"instrument": "Cam1"})
3016 self.assertEqual(result.count(), 4)
3017 result = registry.queryDimensionRecords("detector", where="instrument='Cam1'")
3018 self.assertEqual(result.count(), 4)
3019 result = registry.queryDimensionRecords("detector", where="instrument=instr", bind={"instr": "Cam1"})
3020 self.assertEqual(result.count(), 4)
3022 with self.assertRaisesRegex(DataIdValueError, "dimension instrument"):
3023 result = registry.queryDimensionRecords("detector", instrument="NotCam1")
3024 result.count()
3026 with self.assertRaisesRegex(DataIdValueError, "dimension instrument"):
3027 result = registry.queryDimensionRecords("detector", dataId={"instrument": "NotCam1"})
3028 result.count()
3030 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
3031 result = registry.queryDimensionRecords("detector", where="instrument='NotCam1'")
3032 result.count()
3034 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
3035 result = registry.queryDimensionRecords(
3036 "detector", where="instrument=instr", bind={"instr": "NotCam1"}
3037 )
3038 result.count()
3040 def testDatasetConstrainedDimensionRecordQueries(self):
3041 """Test that queryDimensionRecords works even when given a dataset
3042 constraint whose dimensions extend beyond the requested dimension
3043 element's.
3044 """
3045 registry = self.makeRegistry()
3046 self.loadData(registry, "base.yaml")
3047 self.loadData(registry, "datasets.yaml")
3048 # Query for physical_filter dimension records, using a dataset that
3049 # has both physical_filter and dataset dimensions.
3050 records = registry.queryDimensionRecords(
3051 "physical_filter",
3052 datasets=["flat"],
3053 collections="imported_r",
3054 )
3055 self.assertEqual({record.name for record in records}, {"Cam1-R1", "Cam1-R2"})
3056 # Trying to constrain by all dataset types is an error.
3057 with self.assertRaises(TypeError):
3058 list(registry.queryDimensionRecords("physical_filter", datasets=..., collections="imported_r"))
3060 def testSkyPixDatasetQueries(self):
3061 """Test that we can build queries involving skypix dimensions as long
3062 as a dataset type that uses those dimensions is included.
3063 """
3064 registry = self.makeRegistry()
3065 self.loadData(registry, "base.yaml")
3066 dataset_type = DatasetType(
3067 "a", dimensions=["htm7", "instrument"], universe=registry.dimensions, storageClass="int"
3068 )
3069 registry.registerDatasetType(dataset_type)
3070 run = "r"
3071 registry.registerRun(run)
3072 # First try queries where there are no datasets; the concern is whether
3073 # we can even build and execute these queries without raising, even
3074 # when "doomed" query shortcuts are in play.
3075 self.assertFalse(
3076 list(registry.queryDataIds(["htm7", "instrument"], datasets=dataset_type, collections=run))
3077 )
3078 self.assertFalse(list(registry.queryDatasets(dataset_type, collections=run)))
3079 # Now add a dataset and see that we can get it back.
3080 htm7 = registry.dimensions.skypix["htm"][7].pixelization
3081 data_id = registry.expandDataId(instrument="Cam1", htm7=htm7.universe()[0][0])
3082 (ref,) = registry.insertDatasets(dataset_type, [data_id], run=run)
3083 self.assertEqual(
3084 set(registry.queryDataIds(["htm7", "instrument"], datasets=dataset_type, collections=run)),
3085 {data_id},
3086 )
3087 self.assertEqual(set(registry.queryDatasets(dataset_type, collections=run)), {ref})
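    # A minimal sketch (hypothetical helper) of the skypix lookups used
    # above: ``universe()`` yields a RangeSet of valid pixel index ranges
    # and ``pixel()`` returns the sphgeom region for one index.
    def _example_skypix_pixelization(self, registry: Registry) -> None:
        htm7 = registry.dimensions.skypix["htm"][7].pixelization
        begin, end = htm7.universe()[0]  # first contiguous index range
        print(htm7.pixel(begin))  # region covered by the first pixel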
3089 def testDatasetIdFactory(self):
3090 """Simple test for DatasetIdFactory, mostly to catch potential changes
3091 in its API.
3092 """
3093 registry = self.makeRegistry()
3094 factory = registry.datasetIdFactory
3095 dataset_type = DatasetType(
3096 "datasetType",
3097 dimensions=["detector", "instrument"],
3098 universe=registry.dimensions,
3099 storageClass="int",
3100 )
3101 run = "run"
3102 data_id = DataCoordinate.standardize(instrument="Cam1", detector=1, graph=dataset_type.dimensions)
3104 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.UNIQUE)
3105 self.assertIsInstance(datasetId, uuid.UUID)
3106 self.assertEqual(datasetId.version, 4)
3108 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE)
3109 self.assertIsInstance(datasetId, uuid.UUID)
3110 self.assertEqual(datasetId.version, 5)
3112 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN)
3113 self.assertIsInstance(datasetId, uuid.UUID)
3114 self.assertEqual(datasetId.version, 5)
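    # A minimal sketch (hypothetical helper): UNIQUE yields a random
    # version-4 UUID on every call, while the DATAID_TYPE* modes yield
    # name-based version-5 UUIDs, so identical inputs reproduce the same ID.
    def _example_deterministic_ids(self, registry: Registry) -> None:
        factory = registry.datasetIdFactory
        dataset_type = DatasetType(
            "datasetType",
            dimensions=["detector", "instrument"],
            universe=registry.dimensions,
            storageClass="int",
        )
        data_id = DataCoordinate.standardize(
            instrument="Cam1", detector=1, graph=dataset_type.dimensions
        )
        id1 = factory.makeDatasetId("run", dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE)
        id2 = factory.makeDatasetId("run", dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE)
        assert id1 == id2  # deterministic for identical inputs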
3116 def testExposureQueries(self):
3117 """Test query methods using arguments sourced from the exposure log
3118 service.
3120 The most complete test dataset currently available to daf_butler tests
3121 is the hsc-rc2-subset.yaml export (which is unfortunately distinct
3122 from the lsst/rc2_subset GitHub repo), but that does not have 'exposure'
3123 dimension records as it was focused on providing nontrivial spatial
3124 overlaps between visit+detector and tract+patch. So in this test we
3125 need to translate queries that originally used the exposure dimension
3126 to use the (very similar) visit dimension instead.
3127 """
3128 registry = self.makeRegistry()
3129 self.loadData(registry, "hsc-rc2-subset.yaml")
3130 self.assertEqual(
3131 [
3132 record.id
3133 for record in registry.queryDimensionRecords("visit", instrument="HSC")
3134 .order_by("id")
3135 .limit(5)
3136 ],
3137 [318, 322, 326, 330, 332],
3138 )
3139 self.assertEqual(
3140 [
3141 data_id["visit"]
3142 for data_id in registry.queryDataIds(["visit"], instrument="HSC").order_by("id").limit(5)
3143 ],
3144 [318, 322, 326, 330, 332],
3145 )
3146 self.assertEqual(
3147 [
3148 record.id
3149 for record in registry.queryDimensionRecords("detector", instrument="HSC")
3150 .order_by("full_name")
3151 .limit(5)
3152 ],
3153 [73, 72, 71, 70, 65],
3154 )
3155 self.assertEqual(
3156 [
3157 data_id["detector"]
3158 for data_id in registry.queryDataIds(["detector"], instrument="HSC")
3159 .order_by("full_name")
3160 .limit(5)
3161 ],
3162 [73, 72, 71, 70, 65],
3163 )
3165 def test_long_query_names(self) -> None:
3166 """Test that queries involving very long names are handled correctly.
3168 This is especially important for PostgreSQL, which truncates
3169 identifiers longer than 63 characters, but it's worth testing for all DBs.
3170 """
3171 registry = self.makeRegistry()
3172 name = "abcd" * 17
3173 registry.registerDatasetType(
3174 DatasetType(
3175 name,
3176 dimensions=(),
3177 storageClass="Exposure",
3178 universe=registry.dimensions,
3179 )
3180 )
3181 # Need to search more than one collection actually containing a
3182 # matching dataset to avoid optimizations that sidestep bugs due to
3183 # truncation by making findFirst=True a no-op.
3184 run1 = "run1"
3185 registry.registerRun(run1)
3186 run2 = "run2"
3187 registry.registerRun(run2)
3188 (ref1,) = registry.insertDatasets(name, [DataCoordinate.makeEmpty(registry.dimensions)], run1)
3189 registry.insertDatasets(name, [DataCoordinate.makeEmpty(registry.dimensions)], run2)
3190 self.assertEqual(
3191 set(registry.queryDatasets(name, collections=[run1, run2], findFirst=True)),
3192 {ref1},
3193 )
3195 def test_skypix_constraint_queries(self) -> None:
3196 """Test queries spatially constrained by a skypix data ID."""
3197 registry = self.makeRegistry()
3198 self.loadData(registry, "hsc-rc2-subset.yaml")
3199 patch_regions = {
3200 (data_id["tract"], data_id["patch"]): data_id.region
3201 for data_id in registry.queryDataIds(["patch"]).expanded()
3202 }
3203 skypix_dimension: SkyPixDimension = registry.dimensions["htm11"]
3204 # This check ensures the test doesn't become trivial due to a config
3205 # change; if it does, just pick a different HTM level.
3206 self.assertNotEqual(skypix_dimension, registry.dimensions.commonSkyPix)
3207 # Gather all skypix IDs that definitely overlap at least one of these
3208 # patches.
3209 relevant_skypix_ids = lsst.sphgeom.RangeSet()
3210 for patch_region in patch_regions.values():
3211 relevant_skypix_ids |= skypix_dimension.pixelization.interior(patch_region)
3212 # Look for a "nontrivial" skypix_id that overlaps at least one patch
3213 # and does not overlap at least one other patch.
3214 for skypix_id in itertools.chain.from_iterable(
3215 range(begin, end) for begin, end in relevant_skypix_ids
3216 ):
3217 skypix_region = skypix_dimension.pixelization.pixel(skypix_id)
3218 overlapping_patches = {
3219 patch_key
3220 for patch_key, patch_region in patch_regions.items()
3221 if not patch_region.isDisjointFrom(skypix_region)
3222 }
3223 if overlapping_patches and overlapping_patches != patch_regions.keys():
3224 break
3225 else:
3226 raise RuntimeError("Could not find usable skypix ID for this dimension configuration.")
3227 self.assertEqual(
3228 {
3229 (data_id["tract"], data_id["patch"])
3230 for data_id in registry.queryDataIds(
3231 ["patch"],
3232 dataId={skypix_dimension.name: skypix_id},
3233 )
3234 },
3235 overlapping_patches,
3236 )
3237 # Test that a three-way join that includes the common skypix system in
3238 # the dimensions doesn't generate redundant join terms in the query.
3239 full_data_ids = set(
3240 registry.queryDataIds(
3241 ["tract", "visit", "htm7"], skymap="hsc_rings_v1", instrument="HSC"
3242 ).expanded()
3243 )
3244 self.assertGreater(len(full_data_ids), 0)
3245 for data_id in full_data_ids:
3246 self.assertFalse(data_id.records["tract"].region.isDisjointFrom(data_id.records["htm7"].region))
3247 self.assertFalse(data_id.records["visit"].region.isDisjointFrom(data_id.records["htm7"].region))
3249 def test_spatial_constraint_queries(self) -> None:
3250 """Test queries in which one spatial dimension in the constraint (data
3251 ID or ``where`` string) constrains a different spatial dimension in the
3252 query result columns.
3253 """
3254 registry = self.makeRegistry()
3255 self.loadData(registry, "hsc-rc2-subset.yaml")
3256 patch_regions = {
3257 (data_id["tract"], data_id["patch"]): data_id.region
3258 for data_id in registry.queryDataIds(["patch"]).expanded()
3259 }
3260 observation_regions = {
3261 (data_id["visit"], data_id["detector"]): data_id.region
3262 for data_id in registry.queryDataIds(["visit", "detector"]).expanded()
3263 }
3264 all_combos = {
3265 (patch_key, observation_key)
3266 for patch_key, observation_key in itertools.product(patch_regions, observation_regions)
3267 }
3268 overlapping_combos = {
3269 (patch_key, observation_key)
3270 for patch_key, observation_key in all_combos
3271 if not patch_regions[patch_key].isDisjointFrom(observation_regions[observation_key])
3272 }
3273 # Check a direct spatial join with no constraint first.
3274 self.assertEqual(
3275 {
3276 ((data_id["tract"], data_id["patch"]), (data_id["visit"], data_id["detector"]))
3277 for data_id in registry.queryDataIds(["patch", "visit", "detector"])
3278 },
3279 overlapping_combos,
3280 )
3281 overlaps_by_patch: defaultdict[tuple[int, int], set[tuple[str, str]]] = defaultdict(set)
3282 overlaps_by_observation: defaultdict[tuple[int, int], set[tuple[str, str]]] = defaultdict(set)
3283 for patch_key, observation_key in overlapping_combos:
3284 overlaps_by_patch[patch_key].add(observation_key)
3285 overlaps_by_observation[observation_key].add(patch_key)
3286 # Find patches and observations that each overlap at least one, but
3287 # not all, of the other kind.
3288 nontrivial_patch = next(
3289 iter(
3290 patch_key
3291 for patch_key, observation_keys in overlaps_by_patch.items()
3292 if observation_keys and observation_keys != observation_regions.keys()
3293 )
3294 )
3295 nontrivial_observation = next(
3296 iter(
3297 observation_key
3298 for observation_key, patch_keys in overlaps_by_observation.items()
3299 if patch_keys and patch_keys != patch_regions.keys()
3300 )
3301 )
3302 # Use the nontrivial patches and observations as constraints on the
3303 # other dimensions in various ways, first via a 'where' expression.
3304 # It's better in general to use 'bind' instead of f-strings, but these
3305 # are all integers so there are no quoting concerns.
3306 self.assertEqual(
3307 {
3308 (data_id["visit"], data_id["detector"])
3309 for data_id in registry.queryDataIds(
3310 ["visit", "detector"],
3311 where=f"tract={nontrivial_patch[0]} AND patch={nontrivial_patch[1]}",
3312 skymap="hsc_rings_v1",
3313 )
3314 },
3315 overlaps_by_patch[nontrivial_patch],
3316 )
3317 self.assertEqual(
3318 {
3319 (data_id["tract"], data_id["patch"])
3320 for data_id in registry.queryDataIds(
3321 ["patch"],
3322 where=f"visit={nontrivial_observation[0]} AND detector={nontrivial_observation[1]}",
3323 instrument="HSC",
3324 )
3325 },
3326 overlaps_by_observation[nontrivial_observation],
3327 )
3328 # and then via the dataId argument.
3329 self.assertEqual(
3330 {
3331 (data_id["visit"], data_id["detector"])
3332 for data_id in registry.queryDataIds(
3333 ["visit", "detector"],
3334 dataId={
3335 "tract": nontrivial_patch[0],
3336 "patch": nontrivial_patch[1],
3337 },
3338 skymap="hsc_rings_v1",
3339 )
3340 },
3341 overlaps_by_patch[nontrivial_patch],
3342 )
3343 self.assertEqual(
3344 {
3345 (data_id["tract"], data_id["patch"])
3346 for data_id in registry.queryDataIds(
3347 ["patch"],
3348 dataId={
3349 "visit": nontrivial_observation[0],
3350 "detector": nontrivial_observation[1],
3351 },
3352 instrument="HSC",
3353 )
3354 },
3355 overlaps_by_observation[nontrivial_observation],
3356 )
3358 def test_query_projection_drop_postprocessing(self) -> None:
3359 """Test that projections and deduplications on query objects can
3360 drop post-query region filtering to ensure the query remains in
3361 the SQL engine.
3362 """
3363 registry = self.makeRegistry()
3364 self.loadData(registry, "base.yaml")
3365 self.loadData(registry, "spatial.yaml")
3367 def pop_transfer(tree: Relation) -> Relation:
3368 """If a relation tree terminates with a transfer to a new engine,
3369 return the relation prior to that transfer. If not, return the
3370 original relation.
3371 """
3372 match tree:
3373 case Transfer(target=target):
3374 return target
3375 case _:
3376 return tree
3378 # There's no public way to get a Query object yet, so we get one from a
3379 # DataCoordinateQueryResults private attribute. When a public API is
3380 # available this test should use it.
3381 query = registry.queryDataIds(["visit", "detector", "tract", "patch"])._query
3382 # We expect this query to terminate in the iteration engine originally,
3383 # because region-filtering is necessary.
3384 self.assertIsInstance(pop_transfer(query.relation).engine, iteration.Engine)
3385 # If we deduplicate, we usually have to do that downstream of the
3386 # filtering. That means the deduplication has to happen in the
3387 # iteration engine.
3388 self.assertIsInstance(pop_transfer(query.projected(unique=True).relation).engine, iteration.Engine)
3389 # If we pass drop_postprocessing, we instead drop the region filtering
3390 # so the deduplication can happen in SQL (though there might still be
3391 # transfer to iteration at the tail of the tree that we can ignore;
3392 # that's what the pop_transfer takes care of here).
3393 self.assertIsInstance(
3394 pop_transfer(query.projected(unique=True, drop_postprocessing=True).relation).engine,
3395 sql.Engine,
3396 )
3398 def test_query_empty_collections(self) -> None:
3399 """Test for registry query methods with empty collections. The methods
3400 should return empty result set (or None when applicable) and provide
3401 "doomed" diagnostics.
3402 """
3403 registry = self.makeRegistry()
3404 self.loadData(registry, "base.yaml")
3405 self.loadData(registry, "datasets.yaml")
3407 # Tests for registry.findDataset()
3408 with self.assertRaises(NoDefaultCollectionError):
3409 registry.findDataset("bias", instrument="Cam1", detector=1)
3410 self.assertIsNotNone(registry.findDataset("bias", instrument="Cam1", detector=1, collections=...))
3411 self.assertIsNone(registry.findDataset("bias", instrument="Cam1", detector=1, collections=[]))
3413 # Tests for registry.queryDatasets()
3414 with self.assertRaises(NoDefaultCollectionError):
3415 registry.queryDatasets("bias")
3416 self.assertTrue(list(registry.queryDatasets("bias", collections=...)))
3418 result = registry.queryDatasets("bias", collections=[])
3419 self.assertEqual(len(list(result)), 0)
3420 messages = list(result.explain_no_results())
3421 self.assertTrue(messages)
3422 self.assertTrue(any("because collection list is empty" in message for message in messages))
3424 # Tests for registry.queryDataIds()
3425 with self.assertRaises(NoDefaultCollectionError):
3426 registry.queryDataIds("detector", datasets="bias")
3427 self.assertTrue(list(registry.queryDataIds("detector", datasets="bias", collections=...)))
3429 result = registry.queryDataIds("detector", datasets="bias", collections=[])
3430 self.assertEqual(len(list(result)), 0)
3431 messages = list(result.explain_no_results())
3432 self.assertTrue(messages)
3433 self.assertTrue(any("because collection list is empty" in message for message in messages))
3435 # Tests for registry.queryDimensionRecords()
3436 with self.assertRaises(NoDefaultCollectionError):
3437 registry.queryDimensionRecords("detector", datasets="bias")
3438 self.assertTrue(list(registry.queryDimensionRecords("detector", datasets="bias", collections=...)))
3440 result = registry.queryDimensionRecords("detector", datasets="bias", collections=[])
3441 self.assertEqual(len(list(result)), 0)
3442 messages = list(result.explain_no_results())
3443 self.assertTrue(messages)
3444 self.assertTrue(any("because collection list is empty" in message for message in messages))