# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RegistryTests"]

import itertools
import logging
import os
import re
import unittest
import uuid
from abc import ABC, abstractmethod
from collections import defaultdict, namedtuple
from collections.abc import Iterator
from datetime import datetime, timedelta
from typing import TYPE_CHECKING

import astropy.time
import sqlalchemy

try:
    import numpy as np
except ImportError:
    np = None

import lsst.sphgeom
from lsst.daf.relation import Relation, RelationalAlgebraError, Transfer, iteration, sql

from ...core import (
    DataCoordinate,
    DataCoordinateSet,
    DatasetAssociation,
    DatasetIdFactory,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    NamedValueSet,
    SkyPixDimension,
    StorageClass,
    Timespan,
    ddl,
)
from .._collection_summary import CollectionSummary
from .._collectionType import CollectionType
from .._config import RegistryConfig
from .._exceptions import (
    ArgumentError,
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    DatasetTypeError,
    InconsistentDataIdError,
    MissingCollectionError,
    MissingDatasetTypeError,
    NoDefaultCollectionError,
    OrphanedRecordError,
)
from ..interfaces import ButlerAttributeExistsError

if TYPE_CHECKING:
    from .._registry import Registry


class RegistryTests(ABC):
    """Generic tests for the `Registry` class that can be subclassed to
    generate tests for different configurations.
    """

    collectionsManager: str | None = None
    """Name of the collections manager class.  If a subclass provides a value
    for this member, it overrides the name specified in the default
    configuration (`str`).
    """

    datasetsManager: str | dict[str, str] | None = None
    """Name or configuration dictionary of the datasets manager class.  If a
    subclass provides a value for this member, it overrides the name
    specified in the default configuration (`str` or `dict`).
    """

    @classmethod
    @abstractmethod
    def getDataDir(cls) -> str:
        """Return the root directory containing test data YAML files."""
        raise NotImplementedError()

    def makeRegistryConfig(self) -> RegistryConfig:
        """Create RegistryConfig used to create a registry.

        This method should be called by a subclass from `makeRegistry`.
        Returned instance will be pre-configured based on the values of class
        members, and default-configured for all other parameters.  Subclasses
        that need default configuration should just instantiate
        `RegistryConfig` directly.
        """
        config = RegistryConfig()
        if self.collectionsManager:
            config["managers", "collections"] = self.collectionsManager
        if self.datasetsManager:
            config["managers", "datasets"] = self.datasetsManager
        return config
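
    # A hedged configuration sketch (not part of the original file): a
    # subclass can pin specific manager implementations by setting the class
    # members that makeRegistryConfig() consults.  The manager path below is
    # illustrative only; any importable manager class accepted by the
    # registry configuration would do.
    #
    #     class NameKeyCollectionsRegistryTests(RegistryTests):
    #         collectionsManager = (
    #             "lsst.daf.butler.registry.collections.nameKey.NameKeyCollectionManager"
    #         )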

    @abstractmethod
    def makeRegistry(self, share_repo_with: Registry | None = None) -> Registry | None:
        """Return the Registry instance to be tested.

        Parameters
        ----------
        share_repo_with : `Registry`, optional
            If provided, the new registry should point to the same data
            repository as this existing registry.

        Returns
        -------
        registry : `Registry`
            New `Registry` instance, or `None` *only* if `share_repo_with` is
            not `None` and this test case does not support that argument
            (e.g. it is impossible with in-memory SQLite DBs).
        """
        raise NotImplementedError()
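
    # A minimal sketch (hedged, not from the original file) of how a concrete
    # test case could combine the abstract hooks above.  ``MyRegistry`` and
    # its ``fromConfig`` constructor are hypothetical stand-ins for whatever
    # the backend under test actually provides.
    #
    #     class FileRegistryTests(RegistryTests, unittest.TestCase):
    #         @classmethod
    #         def getDataDir(cls) -> str:
    #             return "data/registry"  # hypothetical location
    #
    #         def makeRegistry(self, share_repo_with=None):
    #             if share_repo_with is not None:
    #                 return None  # e.g. unsupported for in-memory databases
    #             return MyRegistry.fromConfig(self.makeRegistryConfig())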

    def loadData(self, registry: Registry, filename: str):
        """Load registry test data from ``getDataDir/<filename>``,
        which should be a YAML import/export file.
        """
        from ...transfers import YamlRepoImportBackend

        with open(os.path.join(self.getDataDir(), filename)) as stream:
            backend = YamlRepoImportBackend(stream, registry)
        backend.register()
        backend.load(datastore=None)

    def checkQueryResults(self, results, expected):
        """Check that a query results object contains expected values.

        Parameters
        ----------
        results : `DataCoordinateQueryResults` or `DatasetQueryResults`
            A lazy-evaluation query results object.
        expected : `list`
            A list of `DataCoordinate` or `DatasetRef` objects that should be
            equal to results of the query, aside from ordering.
        """
        self.assertCountEqual(list(results), expected)
        self.assertEqual(results.count(), len(expected))
        if expected:
            self.assertTrue(results.any())
        else:
            self.assertFalse(results.any())
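
    # Hedged usage example (not in the original file), built only from calls
    # exercised elsewhere in this suite; the expected list is a placeholder.
    #
    #     results = registry.queryDataIds(["detector"], datasets=["bias"], collections=run)
    #     self.checkQueryResults(results, expected=[...])  # list of DataCoordinate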

    def testOpaque(self):
        """Tests for `Registry.registerOpaqueTable`,
        `Registry.insertOpaqueData`, `Registry.fetchOpaqueData`, and
        `Registry.deleteOpaqueData`.
        """
        registry = self.makeRegistry()
        table = "opaque_table_for_testing"
        registry.registerOpaqueTable(
            table,
            spec=ddl.TableSpec(
                fields=[
                    ddl.FieldSpec("id", dtype=sqlalchemy.BigInteger, primaryKey=True),
                    ddl.FieldSpec("name", dtype=sqlalchemy.String, length=16, nullable=False),
                    ddl.FieldSpec("count", dtype=sqlalchemy.SmallInteger, nullable=True),
                ],
            ),
        )
        rows = [
            {"id": 1, "name": "one", "count": None},
            {"id": 2, "name": "two", "count": 5},
            {"id": 3, "name": "three", "count": 6},
        ]
        registry.insertOpaqueData(table, *rows)
        self.assertCountEqual(rows, list(registry.fetchOpaqueData(table)))
        self.assertEqual(rows[0:1], list(registry.fetchOpaqueData(table, id=1)))
        self.assertEqual(rows[1:2], list(registry.fetchOpaqueData(table, name="two")))
        self.assertEqual(rows[0:1], list(registry.fetchOpaqueData(table, id=(1, 3), name=("one", "two"))))
        self.assertEqual(rows, list(registry.fetchOpaqueData(table, id=(1, 2, 3))))
        # Test very long IN clause which exceeds sqlite limit on number of
        # parameters.  SQLite says the limit is 32k but it looks like it is
        # much higher.
        self.assertEqual(rows, list(registry.fetchOpaqueData(table, id=list(range(300_000)))))
        # Two IN clauses, each longer than 1k batch size, first with
        # duplicates, second has matching elements in different batches (after
        # sorting).
        self.assertEqual(
            rows[0:2],
            list(
                registry.fetchOpaqueData(
                    table,
                    id=list(range(1000)) + list(range(100, 0, -1)),
                    name=["one"] + [f"q{i}" for i in range(2200)] + ["two"],
                )
            ),
        )
        self.assertEqual([], list(registry.fetchOpaqueData(table, id=1, name="two")))
        registry.deleteOpaqueData(table, id=3)
        self.assertCountEqual(rows[:2], list(registry.fetchOpaqueData(table)))
        registry.deleteOpaqueData(table)
        self.assertEqual([], list(registry.fetchOpaqueData(table)))
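
    # The long-IN-clause assertions above rely on the registry splitting an
    # oversized value list into batches before it reaches the database.  A
    # minimal sketch of that chunking idea (the 1000-element batch size is an
    # assumption taken from the comment in the test, not a verified constant):
    #
    #     def _batched(values, batch_size=1000):
    #         ordered = sorted(set(values))
    #         for i in range(0, len(ordered), batch_size):
    #             yield ordered[i : i + batch_size]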

    def testDatasetType(self):
        """Tests for `Registry.registerDatasetType` and
        `Registry.getDatasetType`.
        """
        registry = self.makeRegistry()
        # Check valid insert
        datasetTypeName = "test"
        storageClass = StorageClass("testDatasetType")
        registry.storageClasses.registerStorageClass(storageClass)
        dimensions = registry.dimensions.extract(("instrument", "visit"))
        differentDimensions = registry.dimensions.extract(("instrument", "patch"))
        inDatasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        # Inserting for the first time should return True
        self.assertTrue(registry.registerDatasetType(inDatasetType))
        outDatasetType1 = registry.getDatasetType(datasetTypeName)
        self.assertEqual(outDatasetType1, inDatasetType)

        # Re-inserting should work
        self.assertFalse(registry.registerDatasetType(inDatasetType))
        # Except when they are not identical
        with self.assertRaises(ConflictingDefinitionError):
            nonIdenticalDatasetType = DatasetType(datasetTypeName, differentDimensions, storageClass)
            registry.registerDatasetType(nonIdenticalDatasetType)

        # Template can be None
        datasetTypeName = "testNoneTemplate"
        storageClass = StorageClass("testDatasetType2")
        registry.storageClasses.registerStorageClass(storageClass)
        dimensions = registry.dimensions.extract(("instrument", "visit"))
        inDatasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(inDatasetType)
        outDatasetType2 = registry.getDatasetType(datasetTypeName)
        self.assertEqual(outDatasetType2, inDatasetType)

        allTypes = set(registry.queryDatasetTypes())
        self.assertEqual(allTypes, {outDatasetType1, outDatasetType2})

    def testDimensions(self):
        """Tests for `Registry.insertDimensionData`,
        `Registry.syncDimensionData`, and `Registry.expandDataId`.
        """
        registry = self.makeRegistry()
        dimensionName = "instrument"
        dimension = registry.dimensions[dimensionName]
        dimensionValue = {
            "name": "DummyCam",
            "visit_max": 10,
            "visit_system": 0,
            "exposure_max": 10,
            "detector_max": 2,
            "class_name": "lsst.pipe.base.Instrument",
        }
        registry.insertDimensionData(dimensionName, dimensionValue)
        # Inserting the same value twice should fail
        with self.assertRaises(sqlalchemy.exc.IntegrityError):
            registry.insertDimensionData(dimensionName, dimensionValue)
        # expandDataId should retrieve the record we just inserted
        self.assertEqual(
            registry.expandDataId(instrument="DummyCam", graph=dimension.graph)
            .records[dimensionName]
            .toDict(),
            dimensionValue,
        )
        # expandDataId should raise if there is no record with the given ID.
        with self.assertRaises(DataIdValueError):
            registry.expandDataId({"instrument": "Unknown"}, graph=dimension.graph)
        # band doesn't have a table; insert should fail.
        with self.assertRaises(TypeError):
            registry.insertDimensionData("band", {"band": "i"})
        dimensionName2 = "physical_filter"
        dimension2 = registry.dimensions[dimensionName2]
        dimensionValue2 = {"name": "DummyCam_i", "band": "i"}
        # Missing required dependency ("instrument") should fail
        with self.assertRaises(KeyError):
            registry.insertDimensionData(dimensionName2, dimensionValue2)
        # Adding required dependency should fix the failure
        dimensionValue2["instrument"] = "DummyCam"
        registry.insertDimensionData(dimensionName2, dimensionValue2)
        # expandDataId should retrieve the record we just inserted.
        self.assertEqual(
            registry.expandDataId(instrument="DummyCam", physical_filter="DummyCam_i", graph=dimension2.graph)
            .records[dimensionName2]
            .toDict(),
            dimensionValue2,
        )
        # Use syncDimensionData to insert a new record successfully.
        dimensionName3 = "detector"
        dimensionValue3 = {
            "instrument": "DummyCam",
            "id": 1,
            "full_name": "one",
            "name_in_raft": "zero",
            "purpose": "SCIENCE",
        }
        self.assertTrue(registry.syncDimensionData(dimensionName3, dimensionValue3))
        # Sync that again.  Note that one field ("raft") is NULL, and that
        # should be okay.
        self.assertFalse(registry.syncDimensionData(dimensionName3, dimensionValue3))
        # Now try that sync with the same primary key but a different value.
        # This should fail.
        with self.assertRaises(ConflictingDefinitionError):
            registry.syncDimensionData(
                dimensionName3,
                {
                    "instrument": "DummyCam",
                    "id": 1,
                    "full_name": "one",
                    "name_in_raft": "four",
                    "purpose": "SCIENCE",
                },
            )

    @unittest.skipIf(np is None, "numpy not available.")
    def testNumpyDataId(self):
        """Test that we can use a numpy int in a dataId."""
        registry = self.makeRegistry()
        dimensionEntries = [
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            # Using an np.int64 here fails unless Records.fromDict is also
            # patched to look for numbers.Integral
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        for args in dimensionEntries:
            registry.insertDimensionData(*args)

        # Try a normal integer and something that looks like an int but
        # is not.
        for visit_id in (42, np.int64(42)):
            with self.subTest(visit_id=visit_id, id_type=type(visit_id).__name__):
                expanded = registry.expandDataId({"instrument": "DummyCam", "visit": visit_id})
                self.assertEqual(expanded["visit"], int(visit_id))
                self.assertIsInstance(expanded["visit"], int)

    def testDataIdRelationships(self):
        """Test that `Registry.expandDataId` raises an exception when the
        given keys are inconsistent.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        # Insert a few more dimension records for the next test.
        registry.insertDimensionData(
            "exposure",
            {"instrument": "Cam1", "id": 1, "obs_id": "one", "physical_filter": "Cam1-G"},
        )
        registry.insertDimensionData(
            "exposure",
            {"instrument": "Cam1", "id": 2, "obs_id": "two", "physical_filter": "Cam1-G"},
        )
        registry.insertDimensionData(
            "visit_system",
            {"instrument": "Cam1", "id": 0, "name": "one-to-one"},
        )
        registry.insertDimensionData(
            "visit",
            {"instrument": "Cam1", "id": 1, "name": "one", "physical_filter": "Cam1-G", "visit_system": 0},
        )
        registry.insertDimensionData(
            "visit_definition",
            {"instrument": "Cam1", "visit": 1, "exposure": 1, "visit_system": 0},
        )
        with self.assertRaises(InconsistentDataIdError):
            registry.expandDataId(
                {"instrument": "Cam1", "visit": 1, "exposure": 2},
            )

    def testDataset(self):
        """Basic tests for `Registry.insertDatasets`, `Registry.getDataset`,
        and `Registry.removeDatasets`.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        run = "tésτ"
        registry.registerRun(run)
        datasetType = registry.getDatasetType("bias")
        dataId = {"instrument": "Cam1", "detector": 2}
        (ref,) = registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
        outRef = registry.getDataset(ref.id)
        self.assertIsNotNone(ref.id)
        self.assertEqual(ref, outRef)
        with self.assertRaises(ConflictingDefinitionError):
            registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
        registry.removeDatasets([ref])
        self.assertIsNone(registry.findDataset(datasetType, dataId, collections=[run]))

    def testFindDataset(self):
        """Tests for `Registry.findDataset`."""
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        run = "tésτ"
        datasetType = registry.getDatasetType("bias")
        dataId = {"instrument": "Cam1", "detector": 4}
        registry.registerRun(run)
        (inputRef,) = registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
        outputRef = registry.findDataset(datasetType, dataId, collections=[run])
        self.assertEqual(outputRef, inputRef)
        # Check that retrieval with invalid dataId raises
        with self.assertRaises(LookupError):
            dataId = {"instrument": "Cam1"}  # no detector
            registry.findDataset(datasetType, dataId, collections=run)
        # Check that different dataIds match to different datasets
        dataId1 = {"instrument": "Cam1", "detector": 1}
        (inputRef1,) = registry.insertDatasets(datasetType, dataIds=[dataId1], run=run)
        dataId2 = {"instrument": "Cam1", "detector": 2}
        (inputRef2,) = registry.insertDatasets(datasetType, dataIds=[dataId2], run=run)
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=run), inputRef1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=run), inputRef2)
        self.assertNotEqual(registry.findDataset(datasetType, dataId1, collections=run), inputRef2)
        self.assertNotEqual(registry.findDataset(datasetType, dataId2, collections=run), inputRef1)
        # Check that requesting a non-existing dataId returns None
        nonExistingDataId = {"instrument": "Cam1", "detector": 3}
        self.assertIsNone(registry.findDataset(datasetType, nonExistingDataId, collections=run))
        # Search more than one collection, in which two have the right
        # dataset type and another does not.
        registry.registerRun("empty")
        self.loadData(registry, "datasets-uuid.yaml")
        bias1 = registry.findDataset("bias", instrument="Cam1", detector=2, collections=["imported_g"])
        self.assertIsNotNone(bias1)
        bias2 = registry.findDataset("bias", instrument="Cam1", detector=2, collections=["imported_r"])
        self.assertIsNotNone(bias2)
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "imported_g", "imported_r"]
            ),
        )
        self.assertEqual(
            bias2,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "imported_r", "imported_g"]
            ),
        )
        # Search more than one collection, with one of them a CALIBRATION
        # collection.
        registry.registerCollection("Cam1/calib", CollectionType.CALIBRATION)
        timespan = Timespan(
            begin=astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai"),
            end=astropy.time.Time("2020-01-01T02:00:00", format="isot", scale="tai"),
        )
        registry.certify("Cam1/calib", [bias2], timespan=timespan)
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias",
                instrument="Cam1",
                detector=2,
                collections=["empty", "imported_g", "Cam1/calib"],
                timespan=timespan,
            ),
        )
        self.assertEqual(
            bias2,
            registry.findDataset(
                "bias",
                instrument="Cam1",
                detector=2,
                collections=["empty", "Cam1/calib", "imported_g"],
                timespan=timespan,
            ),
        )
        # If we try to search those same collections without a timespan, it
        # should still work, since the CALIBRATION collection is ignored.
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "imported_g", "Cam1/calib"]
            ),
        )
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "Cam1/calib", "imported_g"]
            ),
        )

    def testRemoveDatasetTypeSuccess(self):
        """Test that Registry.removeDatasetType works when there are no
        datasets of that type present.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        registry.removeDatasetType("flat")
        with self.assertRaises(MissingDatasetTypeError):
            registry.getDatasetType("flat")

    def testRemoveDatasetTypeFailure(self):
        """Test that Registry.removeDatasetType raises when there are datasets
        of that type present or if the dataset type is for a component.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        with self.assertRaises(OrphanedRecordError):
            registry.removeDatasetType("flat")
        with self.assertRaises(ValueError):
            registry.removeDatasetType(DatasetType.nameWithComponent("flat", "image"))

    def testImportDatasetsUUID(self):
        """Test for `Registry._importDatasets` with UUID dataset ID."""
        if isinstance(self.datasetsManager, str):
            if not self.datasetsManager.endswith(".ByDimensionsDatasetRecordStorageManagerUUID"):
                self.skipTest(f"Unexpected dataset manager {self.datasetsManager}")
        elif isinstance(self.datasetsManager, dict) and not self.datasetsManager["cls"].endswith(
            ".ByDimensionsDatasetRecordStorageManagerUUID"
        ):
            self.skipTest(f"Unexpected dataset manager {self.datasetsManager['cls']}")

        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        for run in range(6):
            registry.registerRun(f"run{run}")
        datasetTypeBias = registry.getDatasetType("bias")
        datasetTypeFlat = registry.getDatasetType("flat")
        dataIdBias1 = {"instrument": "Cam1", "detector": 1}
        dataIdBias2 = {"instrument": "Cam1", "detector": 2}
        dataIdFlat1 = {"instrument": "Cam1", "detector": 1, "physical_filter": "Cam1-G", "band": "g"}

        ref = DatasetRef(datasetTypeBias, dataIdBias1, run="run0")
        (ref1,) = registry._importDatasets([ref])
        # UUID is used without change
        self.assertEqual(ref.id, ref1.id)

        # All different failure modes
        refs = (
            # Importing same DatasetRef with different dataset ID is an error
            DatasetRef(datasetTypeBias, dataIdBias1, run="run0"),
            # Same DatasetId but different DataId
            DatasetRef(datasetTypeBias, dataIdBias2, id=ref1.id, run="run0"),
            DatasetRef(datasetTypeFlat, dataIdFlat1, id=ref1.id, run="run0"),
            # Same DatasetRef and DatasetId but different run
            DatasetRef(datasetTypeBias, dataIdBias1, id=ref1.id, run="run1"),
        )
        for ref in refs:
            with self.assertRaises(ConflictingDefinitionError):
                registry._importDatasets([ref])

        # Test reproducible (non-unique) IDs; they can be re-imported
        # multiple times.
        for run, idGenMode in ((2, DatasetIdGenEnum.DATAID_TYPE), (4, DatasetIdGenEnum.DATAID_TYPE_RUN)):
            with self.subTest(idGenMode=idGenMode):
                # Make dataset ref with reproducible dataset ID.
                ref = DatasetRef(datasetTypeBias, dataIdBias1, run=f"run{run}", id_generation_mode=idGenMode)
                (ref1,) = registry._importDatasets([ref])
                self.assertIsInstance(ref1.id, uuid.UUID)
                self.assertEqual(ref1.id.version, 5)
                self.assertEqual(ref1.id, ref.id)

                # Importing it again is OK
                (ref2,) = registry._importDatasets([ref1])
                self.assertEqual(ref2.id, ref1.id)

                # Cannot import to different run with the same ID
                ref = DatasetRef(datasetTypeBias, dataIdBias1, id=ref1.id, run=f"run{run+1}")
                with self.assertRaises(ConflictingDefinitionError):
                    registry._importDatasets([ref])

                ref = DatasetRef(
                    datasetTypeBias, dataIdBias1, run=f"run{run+1}", id_generation_mode=idGenMode
                )
                if idGenMode is DatasetIdGenEnum.DATAID_TYPE:
                    # Cannot import same DATAID_TYPE ref into a new run
                    with self.assertRaises(ConflictingDefinitionError):
                        (ref2,) = registry._importDatasets([ref])
                else:
                    # DATAID_TYPE_RUN ref can be imported into a new run
                    (ref2,) = registry._importDatasets([ref])
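
    # The reproducible IDs checked above are version-5 UUIDs, i.e. they are
    # derived deterministically by hashing.  A hedged sketch of the idea only
    # (the namespace value and the exact fields hashed by DatasetIdFactory
    # are assumptions for illustration, not the real implementation):
    #
    #     namespace = uuid.UUID(int=0)  # stand-in for the factory's namespace
    #     payload = f"{datasetTypeBias.name}:{dataIdBias1}"  # plus run for DATAID_TYPE_RUN
    #     dataset_id = uuid.uuid5(namespace, payload)
    #     assert dataset_id.version == 5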

    def testDatasetTypeComponentQueries(self):
        """Test component options when querying for dataset types.

        All of the behavior here is deprecated, so many of these tests are
        currently wrapped in a context to check that we get a warning whenever
        a component dataset is actually returned.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        # Test querying for dataset types with different inputs.
        # First query for all dataset types; components should only be
        # included when components=True.
        self.assertEqual({"bias", "flat"}, NamedValueSet(registry.queryDatasetTypes()).names)
        self.assertEqual({"bias", "flat"}, NamedValueSet(registry.queryDatasetTypes(components=False)).names)
        with self.assertWarns(FutureWarning):
            self.assertLess(
                {"bias", "flat", "bias.wcs", "flat.photoCalib"},
                NamedValueSet(registry.queryDatasetTypes(components=True)).names,
            )
        # Use a pattern that can match either parent or components.  Again,
        # components are only returned if components=True.
        self.assertEqual({"bias"}, NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"))).names)
        self.assertEqual(
            {"bias"}, NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"), components=False)).names
        )
        with self.assertWarns(FutureWarning):
            self.assertLess(
                {"bias", "bias.wcs"},
                NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"), components=True)).names,
            )
        # This pattern matches only a component.  In this case we also return
        # that component dataset type if components=None.
        with self.assertWarns(FutureWarning):
            self.assertEqual(
                {"bias.wcs"},
                NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"), components=None)).names,
            )
        self.assertEqual(
            set(),
            NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"), components=False)).names,
        )
        with self.assertWarns(FutureWarning):
            self.assertEqual(
                {"bias.wcs"},
                NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"), components=True)).names,
            )
        # Add a dataset type using a StorageClass that we'll then remove;
        # check that this does not affect our ability to query for dataset
        # types (though it will warn).
        tempStorageClass = StorageClass(
            name="TempStorageClass",
            components={
                "data1": registry.storageClasses.getStorageClass("StructuredDataDict"),
                "data2": registry.storageClasses.getStorageClass("StructuredDataDict"),
            },
        )
        registry.storageClasses.registerStorageClass(tempStorageClass)
        datasetType = DatasetType(
            "temporary",
            dimensions=["instrument"],
            storageClass=tempStorageClass,
            universe=registry.dimensions,
        )
        registry.registerDatasetType(datasetType)
        registry.storageClasses._unregisterStorageClass(tempStorageClass.name)
        datasetType._storageClass = None
        del tempStorageClass
        # Querying for all dataset types, including components, should include
        # at least all non-component dataset types (and I don't want to
        # enumerate all of the Exposure components for bias and flat here).
        with self.assertWarns(FutureWarning):
            with self.assertLogs("lsst.daf.butler.registry", logging.WARN) as cm:
                everything = NamedValueSet(registry.queryDatasetTypes(components=True))
        self.assertIn("TempStorageClass", cm.output[0])
        self.assertLess({"bias", "flat", "temporary"}, everything.names)
        # It should not include "temporary.data", because we tried to remove
        # the storage class that would tell it about that.  So if the next
        # line fails (i.e. "temporary.data" _is_ in everything.names), it
        # means this part of the test isn't doing anything, because the
        # _unregister call above isn't simulating the real-life case we want
        # it to simulate, in which different versions of daf_butler in
        # entirely different Python processes interact with the same repo.
        self.assertNotIn("temporary.data", everything.names)
        # Query for dataset types that start with "temp".  This should again
        # not include the component, and also not fail.
        with self.assertLogs("lsst.daf.butler.registry", logging.WARN) as cm:
            startsWithTemp = NamedValueSet(registry.queryDatasetTypes(re.compile("temp.*"), components=True))
        self.assertIn("TempStorageClass", cm.output[0])
        self.assertEqual({"temporary"}, startsWithTemp.names)
        # Querying with no components should not warn at all.
        with self.assertLogs("lsst.daf.butler.registries", logging.WARN) as cm:
            startsWithTemp = NamedValueSet(registry.queryDatasetTypes(re.compile("temp.*"), components=False))
            # Must issue a warning of our own to be captured.
            logging.getLogger("lsst.daf.butler.registries").warning("test message")
        self.assertEqual(len(cm.output), 1)
        self.assertIn("test message", cm.output[0])

    def testComponentLookups(self):
        """Test searching for component datasets via their parents.

        All of the behavior here is deprecated, so many of these tests are
        currently wrapped in a context to check that we get a warning whenever
        a component dataset is actually returned.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        # Test getting the child dataset type (which does still exist in the
        # Registry), and check for consistency with
        # DatasetRef.makeComponentRef.
        collection = "imported_g"
        parentType = registry.getDatasetType("bias")
        childType = registry.getDatasetType("bias.wcs")
        parentRefResolved = registry.findDataset(
            parentType, collections=collection, instrument="Cam1", detector=1
        )
        self.assertIsInstance(parentRefResolved, DatasetRef)
        self.assertEqual(childType, parentRefResolved.makeComponentRef("wcs").datasetType)
        # Search for a single dataset with findDataset.
        childRef1 = registry.findDataset("bias.wcs", collections=collection, dataId=parentRefResolved.dataId)
        self.assertEqual(childRef1, parentRefResolved.makeComponentRef("wcs"))
        # Search for detector data IDs constrained by component dataset
        # existence with queryDataIds.
        with self.assertWarns(FutureWarning):
            dataIds = registry.queryDataIds(
                ["detector"],
                datasets=["bias.wcs"],
                collections=collection,
            ).toSet()
        self.assertEqual(
            dataIds,
            DataCoordinateSet(
                {
                    DataCoordinate.standardize(instrument="Cam1", detector=d, graph=parentType.dimensions)
                    for d in (1, 2, 3)
                },
                parentType.dimensions,
            ),
        )
        # Search for multiple datasets of a single type with queryDatasets.
        with self.assertWarns(FutureWarning):
            childRefs2 = set(
                registry.queryDatasets(
                    "bias.wcs",
                    collections=collection,
                )
            )
        self.assertEqual({ref.datasetType for ref in childRefs2}, {childType})
        self.assertEqual({ref.dataId for ref in childRefs2}, set(dataIds))

    def testCollections(self):
        """Tests for registry methods that manage collections."""
        registry = self.makeRegistry()
        other_registry = self.makeRegistry(share_repo_with=registry)
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        run1 = "imported_g"
        run2 = "imported_r"
        # Test setting a collection docstring after it has been created.
        registry.setCollectionDocumentation(run1, "doc for run1")
        self.assertEqual(registry.getCollectionDocumentation(run1), "doc for run1")
        registry.setCollectionDocumentation(run1, None)
        self.assertIsNone(registry.getCollectionDocumentation(run1))
        datasetType = "bias"
        # Find some datasets via their run's collection.
        dataId1 = {"instrument": "Cam1", "detector": 1}
        ref1 = registry.findDataset(datasetType, dataId1, collections=run1)
        self.assertIsNotNone(ref1)
        dataId2 = {"instrument": "Cam1", "detector": 2}
        ref2 = registry.findDataset(datasetType, dataId2, collections=run1)
        self.assertIsNotNone(ref2)
        # Associate those into a new collection, then look for them there.
        tag1 = "tag1"
        registry.registerCollection(tag1, type=CollectionType.TAGGED, doc="doc for tag1")
        # Check that we can query for old and new collections by type.
        self.assertEqual(set(registry.queryCollections(collectionTypes=CollectionType.RUN)), {run1, run2})
        self.assertEqual(
            set(registry.queryCollections(collectionTypes={CollectionType.TAGGED, CollectionType.RUN})),
            {tag1, run1, run2},
        )
        self.assertEqual(registry.getCollectionDocumentation(tag1), "doc for tag1")
        registry.associate(tag1, [ref1, ref2])
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        # Disassociate one and verify that we can't find it there anymore...
        registry.disassociate(tag1, [ref1])
        self.assertIsNone(registry.findDataset(datasetType, dataId1, collections=tag1))
        # ...but we can still find ref2 in tag1, and ref1 in the run.
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=run1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        collections = set(registry.queryCollections())
        self.assertEqual(collections, {run1, run2, tag1})
        # Associate both refs into tag1 again; ref2 is already there, but that
        # should be a harmless no-op.
        registry.associate(tag1, [ref1, ref2])
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        # Get a different dataset (from a different run) that has the same
        # dataset type and data ID as ref2.
        ref2b = registry.findDataset(datasetType, dataId2, collections=run2)
        self.assertNotEqual(ref2, ref2b)
        # Attempting to associate that into tag1 should be an error.
        with self.assertRaises(ConflictingDefinitionError):
            registry.associate(tag1, [ref2b])
        # That error shouldn't have messed up what we had before.
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        # Attempt to associate the conflicting dataset again, this time with
        # a dataset that isn't in the collection and won't cause a conflict.
        # Should also fail without modifying anything.
        dataId3 = {"instrument": "Cam1", "detector": 3}
        ref3 = registry.findDataset(datasetType, dataId3, collections=run1)
        with self.assertRaises(ConflictingDefinitionError):
            registry.associate(tag1, [ref3, ref2b])
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        self.assertIsNone(registry.findDataset(datasetType, dataId3, collections=tag1))
        # Register a chained collection that searches [tag1, run2]
        chain1 = "chain1"
        registry.registerCollection(chain1, type=CollectionType.CHAINED)
        self.assertIs(registry.getCollectionType(chain1), CollectionType.CHAINED)
        # Chained collection exists, but has no collections in it.
        self.assertFalse(registry.getCollectionChain(chain1))
        # If we query for all collections, we should get the chained
        # collection only if we don't ask to flatten it (i.e. yield only its
        # children).
        self.assertEqual(set(registry.queryCollections(flattenChains=False)), {tag1, run1, run2, chain1})
        self.assertEqual(set(registry.queryCollections(flattenChains=True)), {tag1, run1, run2})
        # Attempt to set its child collections to something circular; that
        # should fail.
        with self.assertRaises(ValueError):
            registry.setCollectionChain(chain1, [tag1, chain1])
        # Add the child collections.
        registry.setCollectionChain(chain1, [tag1, run2])
        self.assertEqual(list(registry.getCollectionChain(chain1)), [tag1, run2])
        self.assertEqual(registry.getCollectionParentChains(tag1), {chain1})
        self.assertEqual(registry.getCollectionParentChains(run2), {chain1})
        # Refresh the other registry that points to the same repo, and make
        # sure it can see the things we've done (note that this does require
        # an explicit refresh(); that's the documented behavior, because
        # caching is ~impossible otherwise).
        if other_registry is not None:
            other_registry.refresh()
            self.assertEqual(list(other_registry.getCollectionChain(chain1)), [tag1, run2])
            self.assertEqual(other_registry.getCollectionParentChains(tag1), {chain1})
            self.assertEqual(other_registry.getCollectionParentChains(run2), {chain1})
        # Searching for dataId1 or dataId2 in the chain should return ref1 and
        # ref2, because both are in tag1.
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=chain1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=chain1), ref2)
        # Now disassociate ref2 from tag1.  The search (for bias) with
        # dataId2 in chain1 should then:
        # 1. not find it in tag1
        # 2. find a different dataset in run2
        registry.disassociate(tag1, [ref2])
        ref2b = registry.findDataset(datasetType, dataId2, collections=chain1)
        self.assertNotEqual(ref2b, ref2)
        self.assertEqual(ref2b, registry.findDataset(datasetType, dataId2, collections=run2))
        # Define a new chain so we can test recursive chains.
        chain2 = "chain2"
        registry.registerCollection(chain2, type=CollectionType.CHAINED)
        registry.setCollectionChain(chain2, [run2, chain1])
        self.assertEqual(registry.getCollectionParentChains(chain1), {chain2})
        self.assertEqual(registry.getCollectionParentChains(run2), {chain1, chain2})
        # Query for collections matching a regex.
        self.assertCountEqual(
            list(registry.queryCollections(re.compile("imported_."), flattenChains=False)),
            ["imported_r", "imported_g"],
        )
        # Query for collections matching a regex or an explicit str.
        self.assertCountEqual(
            list(registry.queryCollections([re.compile("imported_."), "chain1"], flattenChains=False)),
            ["imported_r", "imported_g", "chain1"],
        )
        # Search for bias with dataId1 should find it via tag1 in chain2,
        # recursing, because it is not in run2.
        self.assertIsNone(registry.findDataset(datasetType, dataId1, collections=run2))
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=chain2), ref1)
        # Search for bias with dataId2 should find it in run2 (ref2b).
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=chain2), ref2b)
        # Search for a flat that is in run2.  It should also be found in
        # chain2, via run2 at the front of the chain.
        dataId4 = {"instrument": "Cam1", "detector": 3, "physical_filter": "Cam1-R2"}
        ref4 = registry.findDataset("flat", dataId4, collections=run2)
        self.assertIsNotNone(ref4)
        self.assertEqual(ref4, registry.findDataset("flat", dataId4, collections=chain2))
        # Deleting a collection that's part of a CHAINED collection is not
        # allowed, and is exception-safe.
        with self.assertRaises(sqlalchemy.exc.IntegrityError):
            registry.removeCollection(run2)
        self.assertEqual(registry.getCollectionType(run2), CollectionType.RUN)
        with self.assertRaises(sqlalchemy.exc.IntegrityError):
            registry.removeCollection(chain1)
        self.assertEqual(registry.getCollectionType(chain1), CollectionType.CHAINED)
        # Actually remove chain2, test that it's gone by asking for its type.
        registry.removeCollection(chain2)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(chain2)
        # Actually remove run2 and chain1, which should work now.
        registry.removeCollection(chain1)
        registry.removeCollection(run2)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(run2)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(chain1)
        # Remove tag1 as well, just to test that we can remove TAGGED
        # collections.
        registry.removeCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(tag1)

    def testCollectionChainFlatten(self):
        """Test that Registry.setCollectionChain obeys its 'flatten' option."""
        registry = self.makeRegistry()
        registry.registerCollection("inner", CollectionType.CHAINED)
        registry.registerCollection("innermost", CollectionType.RUN)
        registry.setCollectionChain("inner", ["innermost"])
        registry.registerCollection("outer", CollectionType.CHAINED)
        registry.setCollectionChain("outer", ["inner"], flatten=False)
        self.assertEqual(list(registry.getCollectionChain("outer")), ["inner"])
        registry.setCollectionChain("outer", ["inner"], flatten=True)
        self.assertEqual(list(registry.getCollectionChain("outer")), ["innermost"])

    def testBasicTransaction(self):
        """Test that all operations within a single transaction block are
        rolled back if an exception propagates out of the block.
        """
        registry = self.makeRegistry()
        storageClass = StorageClass("testDatasetType")
        registry.storageClasses.registerStorageClass(storageClass)
        with registry.transaction():
            registry.insertDimensionData("instrument", {"name": "Cam1", "class_name": "A"})
        with self.assertRaises(ValueError):
            with registry.transaction():
                registry.insertDimensionData("instrument", {"name": "Cam2"})
                raise ValueError("Oops, something went wrong")
        # Cam1 should exist
        self.assertEqual(registry.expandDataId(instrument="Cam1").records["instrument"].class_name, "A")
        # But Cam2 and Cam3 should both not exist
        with self.assertRaises(DataIdValueError):
            registry.expandDataId(instrument="Cam2")
        with self.assertRaises(DataIdValueError):
            registry.expandDataId(instrument="Cam3")

    def testNestedTransaction(self):
        """Test that operations within a transaction block are not rolled back
        if an exception propagates out of an inner transaction block and is
        then caught.
        """
        registry = self.makeRegistry()
        dimension = registry.dimensions["instrument"]
        dataId1 = {"instrument": "DummyCam"}
        dataId2 = {"instrument": "DummyCam2"}
        checkpointReached = False
        with registry.transaction():
            # This should be added and (ultimately) committed.
            registry.insertDimensionData(dimension, dataId1)
            with self.assertRaises(sqlalchemy.exc.IntegrityError):
                with registry.transaction(savepoint=True):
                    # This does not conflict, and should succeed (but not
                    # be committed).
                    registry.insertDimensionData(dimension, dataId2)
                    checkpointReached = True
                    # This should conflict and raise, triggering a rollback
                    # of the previous insertion within the same transaction
                    # context, but not the original insertion in the outer
                    # block.
                    registry.insertDimensionData(dimension, dataId1)
        self.assertTrue(checkpointReached)
        self.assertIsNotNone(registry.expandDataId(dataId1, graph=dimension.graph))
        with self.assertRaises(DataIdValueError):
            registry.expandDataId(dataId2, graph=dimension.graph)

    def testInstrumentDimensions(self):
        """Test queries involving only instrument dimensions, with no joins to
        skymap.
        """
        registry = self.makeRegistry()

        # need a bunch of dimensions and datasets for test
        registry.insertDimensionData(
            "instrument", dict(name="DummyCam", visit_max=25, exposure_max=300, detector_max=6)
        )
        registry.insertDimensionData(
            "physical_filter",
            dict(instrument="DummyCam", name="dummy_r", band="r"),
            dict(instrument="DummyCam", name="dummy_i", band="i"),
        )
        registry.insertDimensionData(
            "detector", *[dict(instrument="DummyCam", id=i, full_name=str(i)) for i in range(1, 6)]
        )
        registry.insertDimensionData(
            "visit_system",
            dict(instrument="DummyCam", id=1, name="default"),
        )
        registry.insertDimensionData(
            "visit",
            dict(instrument="DummyCam", id=10, name="ten", physical_filter="dummy_i", visit_system=1),
            dict(instrument="DummyCam", id=11, name="eleven", physical_filter="dummy_r", visit_system=1),
            dict(instrument="DummyCam", id=20, name="twelve", physical_filter="dummy_r", visit_system=1),
        )
        for i in range(1, 6):
            registry.insertDimensionData(
                "visit_detector_region",
                dict(instrument="DummyCam", visit=10, detector=i),
                dict(instrument="DummyCam", visit=11, detector=i),
                dict(instrument="DummyCam", visit=20, detector=i),
            )
        registry.insertDimensionData(
            "exposure",
            dict(instrument="DummyCam", id=100, obs_id="100", physical_filter="dummy_i"),
            dict(instrument="DummyCam", id=101, obs_id="101", physical_filter="dummy_i"),
            dict(instrument="DummyCam", id=110, obs_id="110", physical_filter="dummy_r"),
            dict(instrument="DummyCam", id=111, obs_id="111", physical_filter="dummy_r"),
            dict(instrument="DummyCam", id=200, obs_id="200", physical_filter="dummy_r"),
            dict(instrument="DummyCam", id=201, obs_id="201", physical_filter="dummy_r"),
        )
        registry.insertDimensionData(
            "visit_definition",
            dict(instrument="DummyCam", exposure=100, visit_system=1, visit=10),
            dict(instrument="DummyCam", exposure=101, visit_system=1, visit=10),
            dict(instrument="DummyCam", exposure=110, visit_system=1, visit=11),
            dict(instrument="DummyCam", exposure=111, visit_system=1, visit=11),
            dict(instrument="DummyCam", exposure=200, visit_system=1, visit=20),
            dict(instrument="DummyCam", exposure=201, visit_system=1, visit=20),
        )
        # dataset types
        run1 = "test1_r"
        run2 = "test2_r"
        tagged2 = "test2_t"
        registry.registerRun(run1)
        registry.registerRun(run2)
        registry.registerCollection(tagged2)
        storageClass = StorageClass("testDataset")
        registry.storageClasses.registerStorageClass(storageClass)
        rawType = DatasetType(
            name="RAW",
            dimensions=registry.dimensions.extract(("instrument", "exposure", "detector")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(rawType)
        calexpType = DatasetType(
            name="CALEXP",
            dimensions=registry.dimensions.extract(("instrument", "visit", "detector")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(calexpType)

        # add pre-existing datasets
        for exposure in (100, 101, 110, 111):
            for detector in (1, 2, 3):
                # note that only 3 of 5 detectors have datasets
                dataId = dict(instrument="DummyCam", exposure=exposure, detector=detector)
                (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run1)
                # exposures 100 and 101 appear in both run1 and tagged2.
                # 100 has different datasets in the different collections
                # 101 has the same dataset in both collections.
                if exposure == 100:
                    (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run2)
                if exposure in (100, 101):
                    registry.associate(tagged2, [ref])
        # Add pre-existing datasets to tagged2.
        for exposure in (200, 201):
            for detector in (3, 4, 5):
                # note that only 3 of 5 detectors have datasets
                dataId = dict(instrument="DummyCam", exposure=exposure, detector=detector)
                (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run2)
                registry.associate(tagged2, [ref])

        dimensions = DimensionGraph(
            registry.dimensions, dimensions=(rawType.dimensions.required | calexpType.dimensions.required)
        )
        # Test that single dim string works as well as list of str
        rows = registry.queryDataIds("visit", datasets=rawType, collections=run1).expanded().toSet()
        rowsI = registry.queryDataIds(["visit"], datasets=rawType, collections=run1).expanded().toSet()
        self.assertEqual(rows, rowsI)
        # with empty expression
        rows = registry.queryDataIds(dimensions, datasets=rawType, collections=run1).expanded().toSet()
        self.assertEqual(len(rows), 4 * 3)  # 4 exposures times 3 detectors
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101, 110, 111))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10, 11))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3))

        # second collection
        rows = registry.queryDataIds(dimensions, datasets=rawType, collections=tagged2).toSet()
        self.assertEqual(len(rows), 4 * 3)  # 4 exposures times 3 detectors
        for dataId in rows:
            self.assertCountEqual(dataId.keys(), ("instrument", "detector", "exposure", "visit"))
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101, 200, 201))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10, 20))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3, 4, 5))

        # with two input datasets
        rows = registry.queryDataIds(dimensions, datasets=rawType, collections=[run1, tagged2]).toSet()
        self.assertEqual(len(set(rows)), 6 * 3)  # 6 exposures times 3 detectors; set needed to de-dupe
        for dataId in rows:
            self.assertCountEqual(dataId.keys(), ("instrument", "detector", "exposure", "visit"))
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101, 110, 111, 200, 201))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10, 11, 20))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3, 4, 5))

        # limit to single visit
        rows = registry.queryDataIds(
            dimensions, datasets=rawType, collections=run1, where="visit = 10", instrument="DummyCam"
        ).toSet()
        self.assertEqual(len(rows), 2 * 3)  # 2 exposures times 3 detectors
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10,))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3))

        # more limiting expression, using link names instead of Table.column
        rows = registry.queryDataIds(
            dimensions,
            datasets=rawType,
            collections=run1,
            where="visit = 10 and detector > 1 and 'DummyCam'=instrument",
        ).toSet()
        self.assertEqual(len(rows), 2 * 2)  # 2 exposures times 2 detectors
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10,))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (2, 3))

        # queryDataIds with only one of `datasets` and `collections` is an
        # error.
        with self.assertRaises(CollectionError):
            registry.queryDataIds(dimensions, datasets=rawType)
        with self.assertRaises(ArgumentError):
            registry.queryDataIds(dimensions, collections=run1)

        # expression excludes everything
        rows = registry.queryDataIds(
            dimensions, datasets=rawType, collections=run1, where="visit > 1000", instrument="DummyCam"
        ).toSet()
        self.assertEqual(len(rows), 0)

        # Select by physical_filter; it is not in the requested dimensions,
        # but it is part of the full expression, so it should work too.
        rows = registry.queryDataIds(
            dimensions,
            datasets=rawType,
            collections=run1,
            where="physical_filter = 'dummy_r'",
            instrument="DummyCam",
        ).toSet()
        self.assertEqual(len(rows), 2 * 3)  # 2 exposures times 3 detectors
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (110, 111))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (11,))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3))

    def testSkyMapDimensions(self):
        """Tests involving only skymap dimensions, no joins to instrument."""
        registry = self.makeRegistry()

        # Need a bunch of dimensions and datasets for this test; we want
        # "band" in the test, so we also have to add physical_filter
        # dimensions.
        registry.insertDimensionData("instrument", dict(instrument="DummyCam"))
        registry.insertDimensionData(
            "physical_filter",
            dict(instrument="DummyCam", name="dummy_r", band="r"),
            dict(instrument="DummyCam", name="dummy_i", band="i"),
        )
        registry.insertDimensionData("skymap", dict(name="DummyMap", hash=b"sha!"))
        for tract in range(10):
            registry.insertDimensionData("tract", dict(skymap="DummyMap", id=tract))
            registry.insertDimensionData(
                "patch",
                *[dict(skymap="DummyMap", tract=tract, id=patch, cell_x=0, cell_y=0) for patch in range(10)],
            )

        # dataset types
        run = "tésτ"
        registry.registerRun(run)
        storageClass = StorageClass("testDataset")
        registry.storageClasses.registerStorageClass(storageClass)
        calexpType = DatasetType(
            name="deepCoadd_calexp",
            dimensions=registry.dimensions.extract(("skymap", "tract", "patch", "band")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(calexpType)
        mergeType = DatasetType(
            name="deepCoadd_mergeDet",
            dimensions=registry.dimensions.extract(("skymap", "tract", "patch")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(mergeType)
        measType = DatasetType(
            name="deepCoadd_meas",
            dimensions=registry.dimensions.extract(("skymap", "tract", "patch", "band")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(measType)

        dimensions = DimensionGraph(
            registry.dimensions,
            dimensions=(
                calexpType.dimensions.required | mergeType.dimensions.required | measType.dimensions.required
            ),
        )

        # add pre-existing datasets
        for tract in (1, 3, 5):
            for patch in (2, 4, 6, 7):
                dataId = dict(skymap="DummyMap", tract=tract, patch=patch)
                registry.insertDatasets(mergeType, dataIds=[dataId], run=run)
                for aFilter in ("i", "r"):
                    dataId = dict(skymap="DummyMap", tract=tract, patch=patch, band=aFilter)
                    registry.insertDatasets(calexpType, dataIds=[dataId], run=run)

        # with empty expression
        rows = registry.queryDataIds(dimensions, datasets=[calexpType, mergeType], collections=run).toSet()
        self.assertEqual(len(rows), 3 * 4 * 2)  # 3 tracts x 4 patches x 2 filters
        for dataId in rows:
            self.assertCountEqual(dataId.keys(), ("skymap", "tract", "patch", "band"))
        self.assertCountEqual({dataId["tract"] for dataId in rows}, (1, 3, 5))
        self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 4, 6, 7))
        self.assertCountEqual({dataId["band"] for dataId in rows}, ("i", "r"))

        # limit to 2 tracts and 2 patches
        rows = registry.queryDataIds(
            dimensions,
            datasets=[calexpType, mergeType],
            collections=run,
            where="tract IN (1, 5) AND patch IN (2, 7)",
            skymap="DummyMap",
        ).toSet()
        self.assertEqual(len(rows), 2 * 2 * 2)  # 2 tracts x 2 patches x 2 filters
        self.assertCountEqual({dataId["tract"] for dataId in rows}, (1, 5))
        self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 7))
        self.assertCountEqual({dataId["band"] for dataId in rows}, ("i", "r"))

        # limit to single filter
        rows = registry.queryDataIds(
            dimensions, datasets=[calexpType, mergeType], collections=run, where="band = 'i'"
        ).toSet()
        self.assertEqual(len(rows), 3 * 4 * 1)  # 3 tracts x 4 patches x 1 filter
1227 self.assertCountEqual({dataId["tract"] for dataId in rows}, (1, 3, 5))
1228 self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 4, 6, 7))
1229 self.assertCountEqual({dataId["band"] for dataId in rows}, ("i",))
1231 # Specifying non-existing skymap is an exception
1232 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
1233 rows = registry.queryDataIds(
1234 dimensions, datasets=[calexpType, mergeType], collections=run, where="skymap = 'Mars'"
1235 ).toSet()
1237 def testSpatialJoin(self):
1238 """Test queries that involve spatial overlap joins."""
1239 registry = self.makeRegistry()
1240 self.loadData(registry, "hsc-rc2-subset.yaml")
1242 # Dictionary of spatial DatabaseDimensionElements, keyed by the name of
1243 # the TopologicalFamily they belong to. We'll relate all elements in
1244 # each family to all of the elements in each other family.
1245 families = defaultdict(set)
1246 # Dictionary of {element.name: {dataId: region}}.
1247 regions = {}
1248 for element in registry.dimensions.getDatabaseElements():
1249 if element.spatial is not None:
1250 families[element.spatial.name].add(element)
1251 regions[element.name] = {
1252 record.dataId: record.region for record in registry.queryDimensionRecords(element)
1253 }
1255 # If this check fails, it's not necessarily a problem - it may just be
1256 # a reasonable change to the default dimension definitions - but the
1257 # test below depends on there being more than one family to do anything
1258 # useful.
1259 self.assertEqual(len(families), 2)
1261 # Overlap DatabaseDimensionElements with each other.
1262 for family1, family2 in itertools.combinations(families, 2):
1263 for element1, element2 in itertools.product(families[family1], families[family2]):
1264 graph = DimensionGraph.union(element1.graph, element2.graph)
1265 # Construct expected set of overlapping data IDs via a
1266 # brute-force comparison of the regions we've already fetched.
1267 expected = {
1268 DataCoordinate.standardize({**dataId1.byName(), **dataId2.byName()}, graph=graph)
1269 for (dataId1, region1), (dataId2, region2) in itertools.product(
1270 regions[element1.name].items(), regions[element2.name].items()
1271 )
1272 if not region1.isDisjointFrom(region2)
1273 }
1274 self.assertGreater(len(expected), 2, msg="Test that we aren't just comparing empty sets.")
1275 queried = set(registry.queryDataIds(graph))
1276 self.assertEqual(expected, queried)
1278 # Overlap each DatabaseDimensionElement with the commonSkyPix system.
1279 commonSkyPix = registry.dimensions.commonSkyPix
1280 for elementName, these_regions in regions.items():
1281 graph = DimensionGraph.union(registry.dimensions[elementName].graph, commonSkyPix.graph)
1282 expected = set()
1283 for dataId, region in these_regions.items():
1284 for begin, end in commonSkyPix.pixelization.envelope(region):
1285 expected.update(
1286 DataCoordinate.standardize({commonSkyPix.name: index, **dataId.byName()}, graph=graph)
1287 for index in range(begin, end)
1288 )
1289 self.assertGreater(len(expected), 2, msg="Test that we aren't just comparing empty sets.")
1290 queried = set(registry.queryDataIds(graph))
1291 self.assertEqual(expected, queried)
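# The brute-force oracle above reduces to a single predicate on any two
# lsst.sphgeom regions; a minimal sketch (hypothetical helper, not part of
# this test):
def regions_overlap(a: lsst.sphgeom.Region, b: lsst.sphgeom.Region) -> bool:
    # isDisjointFrom returning False is how both loops above decide that
    # two regions (possibly) overlap.
    return not a.isDisjointFrom(b)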
1293 def testAbstractQuery(self):
1294 """Test that we can run a query that just lists the known
1295 bands. This is tricky because band is
1296 backed by a query against physical_filter.
1297 """
1298 registry = self.makeRegistry()
1299 registry.insertDimensionData("instrument", dict(name="DummyCam"))
1300 registry.insertDimensionData(
1301 "physical_filter",
1302 dict(instrument="DummyCam", name="dummy_i", band="i"),
1303 dict(instrument="DummyCam", name="dummy_i2", band="i"),
1304 dict(instrument="DummyCam", name="dummy_r", band="r"),
1305 )
1306 rows = registry.queryDataIds(["band"]).toSet()
1307 self.assertCountEqual(
1308 rows,
1309 [
1310 DataCoordinate.standardize(band="i", universe=registry.dimensions),
1311 DataCoordinate.standardize(band="r", universe=registry.dimensions),
1312 ],
1313 )
1315 def testAttributeManager(self):
1316 """Test basic functionality of attribute manager."""
1317 # Number of attributes with schema versions in a fresh database:
1318 # 6 managers with 2 records per manager, plus config for dimensions.
1319 VERSION_COUNT = 6 * 2 + 1
1321 registry = self.makeRegistry()
1322 attributes = registry._managers.attributes
1324 # check what get() returns for a non-existent key
1325 self.assertIsNone(attributes.get("attr"))
1326 self.assertEqual(attributes.get("attr", ""), "")
1327 self.assertEqual(attributes.get("attr", "Value"), "Value")
1328 self.assertEqual(len(list(attributes.items())), VERSION_COUNT)
1330 # cannot store empty key or value
1331 with self.assertRaises(ValueError):
1332 attributes.set("", "value")
1333 with self.assertRaises(ValueError):
1334 attributes.set("attr", "")
1336 # set the value of a non-existent key
1337 attributes.set("attr", "value")
1338 self.assertEqual(len(list(attributes.items())), VERSION_COUNT + 1)
1339 self.assertEqual(attributes.get("attr"), "value")
1341 # update value of existing key
1342 with self.assertRaises(ButlerAttributeExistsError):
1343 attributes.set("attr", "value2")
1345 attributes.set("attr", "value2", force=True)
1346 self.assertEqual(len(list(attributes.items())), VERSION_COUNT + 1)
1347 self.assertEqual(attributes.get("attr"), "value2")
1349 # delete existing key
1350 self.assertTrue(attributes.delete("attr"))
1351 self.assertEqual(len(list(attributes.items())), VERSION_COUNT)
1354 # delete a non-existent key
1354 self.assertFalse(attributes.delete("non-attr"))
1356 # store a bunch of keys and read the full list back
1357 data = [
1358 ("version.core", "1.2.3"),
1359 ("version.dimensions", "3.2.1"),
1360 ("config.managers.opaque", "ByNameOpaqueTableStorageManager"),
1361 ]
1362 for key, value in data:
1363 attributes.set(key, value)
1364 items = dict(attributes.items())
1365 for key, value in data:
1366 self.assertEqual(items[key], value)
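# A minimal usage sketch of the upsert pattern exercised above
# (hypothetical helper; `attributes` is the same manager instance):
def upsert(key: str, value: str) -> None:
    # force=True overwrites an existing value instead of raising
    # ButlerAttributeExistsError.
    attributes.set(key, value, force=True)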
1368 def testQueryDatasetsDeduplication(self):
1369 """Test that the findFirst option to queryDatasets selects datasets
1370 from collections in the order given".
1371 """
1372 registry = self.makeRegistry()
1373 self.loadData(registry, "base.yaml")
1374 self.loadData(registry, "datasets.yaml")
1375 self.assertCountEqual(
1376 list(registry.queryDatasets("bias", collections=["imported_g", "imported_r"])),
1377 [
1378 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1379 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g"),
1380 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g"),
1381 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r"),
1382 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r"),
1383 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1384 ],
1385 )
1386 self.assertCountEqual(
1387 list(registry.queryDatasets("bias", collections=["imported_g", "imported_r"], findFirst=True)),
1388 [
1389 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1390 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g"),
1391 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g"),
1392 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1393 ],
1394 )
1395 self.assertCountEqual(
1396 list(registry.queryDatasets("bias", collections=["imported_r", "imported_g"], findFirst=True)),
1397 [
1398 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1399 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r"),
1400 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r"),
1401 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1402 ],
1403 )
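# Conceptually, findFirst=True keeps, for each data ID, only the dataset
# from the first collection in the search path that contains one. A
# brute-force sketch of that rule (hypothetical helper, not Registry API):
def find_first(refs_by_collection: dict, collection_order: list) -> list:
    result = {}
    for collection in collection_order:
        for ref in refs_by_collection.get(collection, []):
            # Keep only the first ref seen for each data ID.
            result.setdefault(ref.dataId, ref)
    return list(result.values())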
1405 def testQueryResults(self):
1406 """Test querying for data IDs and then manipulating the QueryResults
1407 object returned to perform other queries.
1408 """
1409 registry = self.makeRegistry()
1410 self.loadData(registry, "base.yaml")
1411 self.loadData(registry, "datasets.yaml")
1412 bias = registry.getDatasetType("bias")
1413 flat = registry.getDatasetType("flat")
1414 # Obtain expected results from methods other than those we're testing
1415 # here. That includes:
1416 # - the dimensions of the data IDs we want to query:
1417 expectedGraph = DimensionGraph(registry.dimensions, names=["detector", "physical_filter"])
1418 # - the dimensions of some other data IDs we'll extract from that:
1419 expectedSubsetGraph = DimensionGraph(registry.dimensions, names=["detector"])
1420 # - the data IDs we expect to obtain from the first queries:
1421 expectedDataIds = DataCoordinateSet(
1422 {
1423 DataCoordinate.standardize(
1424 instrument="Cam1", detector=d, physical_filter=p, universe=registry.dimensions
1425 )
1426 for d, p in itertools.product({1, 2, 3}, {"Cam1-G", "Cam1-R1", "Cam1-R2"})
1427 },
1428 graph=expectedGraph,
1429 hasFull=False,
1430 hasRecords=False,
1431 )
1432 # - the flat datasets we expect to find from those data IDs, in just
1433 # one collection (so deduplication is irrelevant):
1434 expectedFlats = [
1435 registry.findDataset(
1436 flat, instrument="Cam1", detector=1, physical_filter="Cam1-R1", collections="imported_r"
1437 ),
1438 registry.findDataset(
1439 flat, instrument="Cam1", detector=2, physical_filter="Cam1-R1", collections="imported_r"
1440 ),
1441 registry.findDataset(
1442 flat, instrument="Cam1", detector=3, physical_filter="Cam1-R2", collections="imported_r"
1443 ),
1444 ]
1445 # - the data IDs we expect to extract from that:
1446 expectedSubsetDataIds = expectedDataIds.subset(expectedSubsetGraph)
1447 # - the bias datasets we expect to find from those data IDs, after we
1448 # subset out the physical_filter dimension, both with duplicates:
1449 expectedAllBiases = [
1450 registry.findDataset(bias, instrument="Cam1", detector=1, collections="imported_g"),
1451 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_g"),
1452 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_g"),
1453 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_r"),
1454 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_r"),
1455 ]
1456 # - ...and without duplicates:
1457 expectedDeduplicatedBiases = [
1458 registry.findDataset(bias, instrument="Cam1", detector=1, collections="imported_g"),
1459 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_r"),
1460 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_r"),
1461 ]
1462 # Test against those expected results, using a "lazy" query for the
1463 # data IDs (which re-executes that query each time we use it to do
1464 # something new).
1465 dataIds = registry.queryDataIds(
1466 ["detector", "physical_filter"],
1467 where="detector.purpose = 'SCIENCE'", # this rejects detector=4
1468 instrument="Cam1",
1469 )
1470 self.assertEqual(dataIds.graph, expectedGraph)
1471 self.assertEqual(dataIds.toSet(), expectedDataIds)
1472 self.assertCountEqual(
1473 list(
1474 dataIds.findDatasets(
1475 flat,
1476 collections=["imported_r"],
1477 )
1478 ),
1479 expectedFlats,
1480 )
1481 subsetDataIds = dataIds.subset(expectedSubsetGraph, unique=True)
1482 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1483 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1484 self.assertCountEqual(
1485 list(subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=False)),
1486 expectedAllBiases,
1487 )
1488 self.assertCountEqual(
1489 list(subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)),
1490 expectedDeduplicatedBiases,
1491 )
1493 # Check dimensions match.
1494 with self.assertRaises(ValueError):
1495 subsetDataIds.findDatasets("flat", collections=["imported_r", "imported_g"], findFirst=True)
1497 # Use a component dataset type.
1498 self.assertCountEqual(
1499 [
1500 ref.makeComponentRef("image")
1501 for ref in subsetDataIds.findDatasets(
1502 bias,
1503 collections=["imported_r", "imported_g"],
1504 findFirst=False,
1505 )
1506 ],
1507 [ref.makeComponentRef("image") for ref in expectedAllBiases],
1508 )
1510 # Use a dataset type name and a DatasetType object that are not
1511 # registered with the registry.
1512 unknown_type = DatasetType("not_known", dimensions=bias.dimensions, storageClass="Exposure")
1514 # Test both string name and dataset type object.
1515 test_type: str | DatasetType
1516 for test_type, test_type_name in (
1517 (unknown_type, unknown_type.name),
1518 (unknown_type.name, unknown_type.name),
1519 ):
1520 with self.assertRaisesRegex(DatasetTypeError, expected_regex=test_type_name):
1521 list(
1522 subsetDataIds.findDatasets(
1523 test_type, collections=["imported_r", "imported_g"], findFirst=True
1524 )
1525 )
1527 # Materialize the bias dataset queries (only) by putting the results
1528 # into temporary tables, then repeat those tests.
1529 with subsetDataIds.findDatasets(
1530 bias, collections=["imported_r", "imported_g"], findFirst=False
1531 ).materialize() as biases:
1532 self.assertCountEqual(list(biases), expectedAllBiases)
1533 with subsetDataIds.findDatasets(
1534 bias, collections=["imported_r", "imported_g"], findFirst=True
1535 ).materialize() as biases:
1536 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1537 # Materialize the data ID subset query, but not the dataset queries.
1538 with subsetDataIds.materialize() as subsetDataIds:
1539 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1540 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1541 self.assertCountEqual(
1542 list(
1543 subsetDataIds.findDatasets(
1544 bias, collections=["imported_r", "imported_g"], findFirst=False
1545 )
1546 ),
1547 expectedAllBiases,
1548 )
1549 self.assertCountEqual(
1550 list(
1551 subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)
1552 ),
1553 expectedDeduplicatedBiases,
1554 )
1555 # Materialize the dataset queries, too.
1556 with subsetDataIds.findDatasets(
1557 bias, collections=["imported_r", "imported_g"], findFirst=False
1558 ).materialize() as biases:
1559 self.assertCountEqual(list(biases), expectedAllBiases)
1560 with subsetDataIds.findDatasets(
1561 bias, collections=["imported_r", "imported_g"], findFirst=True
1562 ).materialize() as biases:
1563 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1564 # Materialize the original query, but none of the follow-up queries.
1565 with dataIds.materialize() as dataIds:
1566 self.assertEqual(dataIds.graph, expectedGraph)
1567 self.assertEqual(dataIds.toSet(), expectedDataIds)
1568 self.assertCountEqual(
1569 list(
1570 dataIds.findDatasets(
1571 flat,
1572 collections=["imported_r"],
1573 )
1574 ),
1575 expectedFlats,
1576 )
1577 subsetDataIds = dataIds.subset(expectedSubsetGraph, unique=True)
1578 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1579 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1580 self.assertCountEqual(
1581 list(
1582 subsetDataIds.findDatasets(
1583 bias, collections=["imported_r", "imported_g"], findFirst=False
1584 )
1585 ),
1586 expectedAllBiases,
1587 )
1588 self.assertCountEqual(
1589 list(
1590 subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)
1591 ),
1592 expectedDeduplicatedBiases,
1593 )
1594 # Materialize just the bias dataset queries.
1595 with subsetDataIds.findDatasets(
1596 bias, collections=["imported_r", "imported_g"], findFirst=False
1597 ).materialize() as biases:
1598 self.assertCountEqual(list(biases), expectedAllBiases)
1599 with subsetDataIds.findDatasets(
1600 bias, collections=["imported_r", "imported_g"], findFirst=True
1601 ).materialize() as biases:
1602 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1603 # Materialize the subset data ID query, but not the dataset
1604 # queries.
1605 with subsetDataIds.materialize() as subsetDataIds:
1606 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1607 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1608 self.assertCountEqual(
1609 list(
1610 subsetDataIds.findDatasets(
1611 bias, collections=["imported_r", "imported_g"], findFirst=False
1612 )
1613 ),
1614 expectedAllBiases,
1615 )
1616 self.assertCountEqual(
1617 list(
1618 subsetDataIds.findDatasets(
1619 bias, collections=["imported_r", "imported_g"], findFirst=True
1620 )
1621 ),
1622 expectedDeduplicatedBiases,
1623 )
1624 # Materialize the bias dataset queries, too, so now we're
1625 # materializing every single step.
1626 with subsetDataIds.findDatasets(
1627 bias, collections=["imported_r", "imported_g"], findFirst=False
1628 ).materialize() as biases:
1629 self.assertCountEqual(list(biases), expectedAllBiases)
1630 with subsetDataIds.findDatasets(
1631 bias, collections=["imported_r", "imported_g"], findFirst=True
1632 ).materialize() as biases:
1633 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
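# For reference, materialize() uploads the query results into a temporary
# table for the duration of the context block, so follow-up queries run
# against that table instead of re-executing the original "lazy" query
# noted at the start of this test. Minimal usage sketch (same registry):
with registry.queryDataIds(["detector"], instrument="Cam1").materialize() as materialized:
    self.assertEqual(materialized.graph, DimensionGraph(registry.dimensions, names=["detector"]))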
1635 def testStorageClassPropagation(self):
1636 """Test that queries for datasets respect the storage class passed in
1637 as part of a full dataset type.
1638 """
1639 registry = self.makeRegistry()
1640 self.loadData(registry, "base.yaml")
1641 dataset_type_in_registry = DatasetType(
1642 "tbl", dimensions=["instrument"], storageClass="Packages", universe=registry.dimensions
1643 )
1644 registry.registerDatasetType(dataset_type_in_registry)
1645 run = "run1"
1646 registry.registerRun(run)
1647 (inserted_ref,) = registry.insertDatasets(
1648 dataset_type_in_registry, [registry.expandDataId(instrument="Cam1")], run=run
1649 )
1650 self.assertEqual(inserted_ref.datasetType, dataset_type_in_registry)
1651 query_dataset_type = DatasetType(
1652 "tbl", dimensions=["instrument"], storageClass="StructuredDataDict", universe=registry.dimensions
1653 )
1654 self.assertNotEqual(dataset_type_in_registry, query_dataset_type)
1655 query_datasets_result = registry.queryDatasets(query_dataset_type, collections=[run])
1656 self.assertEqual(query_datasets_result.parentDatasetType, query_dataset_type) # type: ignore
1657 (query_datasets_ref,) = query_datasets_result
1658 self.assertEqual(query_datasets_ref.datasetType, query_dataset_type)
1659 query_data_ids_find_datasets_result = registry.queryDataIds(["instrument"]).findDatasets(
1660 query_dataset_type, collections=[run]
1661 )
1662 self.assertEqual(query_data_ids_find_datasets_result.parentDatasetType, query_dataset_type)
1663 (query_data_ids_find_datasets_ref,) = query_data_ids_find_datasets_result
1664 self.assertEqual(query_data_ids_find_datasets_ref.datasetType, query_dataset_type)
1665 query_dataset_types_result = registry.queryDatasetTypes(query_dataset_type)
1666 self.assertEqual(list(query_dataset_types_result), [query_dataset_type])
1667 find_dataset_ref = registry.findDataset(query_dataset_type, instrument="Cam1", collections=[run])
1668 self.assertEqual(find_dataset_ref.datasetType, query_dataset_type)
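# The pattern under test, reduced to a standalone sketch: a query-side
# DatasetType that differs from the registered one only in storage class
# makes every returned ref carry the requested storage class (names
# reused from this test):
sketch_type = DatasetType(
    "tbl", dimensions=["instrument"], storageClass="StructuredDataDict", universe=registry.dimensions
)
(sketch_ref,) = registry.queryDatasets(sketch_type, collections=[run])
assert sketch_ref.datasetType == sketch_type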
1670 def testEmptyDimensionsQueries(self):
1671 """Test Query and QueryResults objects in the case where there are no
1672 dimensions.
1673 """
1674 # Set up test data: one dataset type, two runs, one dataset in each.
1675 registry = self.makeRegistry()
1676 self.loadData(registry, "base.yaml")
1677 schema = DatasetType("schema", dimensions=registry.dimensions.empty, storageClass="Catalog")
1678 registry.registerDatasetType(schema)
1679 dataId = DataCoordinate.makeEmpty(registry.dimensions)
1680 run1 = "run1"
1681 run2 = "run2"
1682 registry.registerRun(run1)
1683 registry.registerRun(run2)
1684 (dataset1,) = registry.insertDatasets(schema, dataIds=[dataId], run=run1)
1685 (dataset2,) = registry.insertDatasets(schema, dataIds=[dataId], run=run2)
1686 # Query directly for both of the datasets, and each one, one at a time.
1687 self.checkQueryResults(
1688 registry.queryDatasets(schema, collections=[run1, run2], findFirst=False), [dataset1, dataset2]
1689 )
1690 self.checkQueryResults(
1691 registry.queryDatasets(schema, collections=[run1, run2], findFirst=True),
1692 [dataset1],
1693 )
1694 self.checkQueryResults(
1695 registry.queryDatasets(schema, collections=[run2, run1], findFirst=True),
1696 [dataset2],
1697 )
1698 # Query for data IDs with no dimensions.
1699 dataIds = registry.queryDataIds([])
1700 self.checkQueryResults(dataIds, [dataId])
1701 # Use queried data IDs to find the datasets.
1702 self.checkQueryResults(
1703 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1704 [dataset1, dataset2],
1705 )
1706 self.checkQueryResults(
1707 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1708 [dataset1],
1709 )
1710 self.checkQueryResults(
1711 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1712 [dataset2],
1713 )
1714 # Now materialize the data ID query results and repeat those tests.
1715 with dataIds.materialize() as dataIds:
1716 self.checkQueryResults(dataIds, [dataId])
1717 self.checkQueryResults(
1718 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1719 [dataset1],
1720 )
1721 self.checkQueryResults(
1722 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1723 [dataset2],
1724 )
1725 # Query for non-empty data IDs, then subset that to get the empty one.
1726 # Repeat the above tests starting from that.
1727 dataIds = registry.queryDataIds(["instrument"]).subset(registry.dimensions.empty, unique=True)
1728 self.checkQueryResults(dataIds, [dataId])
1729 self.checkQueryResults(
1730 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1731 [dataset1, dataset2],
1732 )
1733 self.checkQueryResults(
1734 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1735 [dataset1],
1736 )
1737 self.checkQueryResults(
1738 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1739 [dataset2],
1740 )
1741 with dataIds.materialize() as dataIds:
1742 self.checkQueryResults(dataIds, [dataId])
1743 self.checkQueryResults(
1744 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1745 [dataset1, dataset2],
1746 )
1747 self.checkQueryResults(
1748 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1749 [dataset1],
1750 )
1751 self.checkQueryResults(
1752 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1753 [dataset2],
1754 )
1755 # Query for non-empty data IDs, then materialize, then subset to get
1756 # the empty one. Repeat again.
1757 with registry.queryDataIds(["instrument"]).materialize() as nonEmptyDataIds:
1758 dataIds = nonEmptyDataIds.subset(registry.dimensions.empty, unique=True)
1759 self.checkQueryResults(dataIds, [dataId])
1760 self.checkQueryResults(
1761 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1762 [dataset1, dataset2],
1763 )
1764 self.checkQueryResults(
1765 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1766 [dataset1],
1767 )
1768 self.checkQueryResults(
1769 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1770 [dataset2],
1771 )
1772 with dataIds.materialize() as dataIds:
1773 self.checkQueryResults(dataIds, [dataId])
1774 self.checkQueryResults(
1775 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1776 [dataset1, dataset2],
1777 )
1778 self.checkQueryResults(
1779 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1780 [dataset1],
1781 )
1782 self.checkQueryResults(
1783 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1784 [dataset2],
1785 )
1786 # Query for non-empty data IDs with a constraint on an empty-data-ID
1787 # dataset that exists.
1788 dataIds = registry.queryDataIds(["instrument"], datasets="schema", collections=...)
1789 self.checkQueryResults(
1790 dataIds.subset(unique=True),
1791 [DataCoordinate.standardize(instrument="Cam1", universe=registry.dimensions)],
1792 )
1793 # Again query for non-empty data IDs with a constraint on empty-data-ID
1794 # datasets, but when the datasets don't exist. We delete the existing
1795 # dataset and query just that collection rather than creating a new
1796 # empty collection because this is a bit less likely for our build-time
1797 # logic to shortcut-out (via the collection summaries), and such a
1798 # shortcut would make this test a bit more trivial than we'd like.
1799 registry.removeDatasets([dataset2])
1800 dataIds = registry.queryDataIds(["instrument"], datasets="schema", collections=run2)
1801 self.checkQueryResults(dataIds, [])
1803 def testDimensionDataModifications(self):
1804 """Test that modifying dimension records via:
1805 syncDimensionData(..., update=True) and
1806 insertDimensionData(..., replace=True) works as expected, even in the
1807 presence of datasets using those dimensions and spatial overlap
1808 relationships.
1809 """
1811 def unpack_range_set(ranges: lsst.sphgeom.RangeSet) -> Iterator[int]:
1812 """Unpack a sphgeom.RangeSet into the integers it contains."""
1813 for begin, end in ranges:
1814 yield from range(begin, end)
1816 def range_set_hull(
1817 ranges: lsst.sphgeom.RangeSet,
1818 pixelization: lsst.sphgeom.HtmPixelization,
1819 ) -> lsst.sphgeom.ConvexPolygon:
1820 """Create a ConvexPolygon hull of the region defined by a set of
1821 HTM pixelization index ranges.
1822 """
1823 points = []
1824 for index in unpack_range_set(ranges):
1825 points.extend(pixelization.triangle(index).getVertices())
1826 return lsst.sphgeom.ConvexPolygon(points)
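# Quick usage sketch for the helpers above (level 6 and index 12288
# mirror the values used just below):
sketch_pixelization = lsst.sphgeom.HtmPixelization(6)
sketch_hull = range_set_hull(lsst.sphgeom.RangeSet(12288).scaled(4), sketch_pixelization)
assert isinstance(sketch_hull, lsst.sphgeom.ConvexPolygon)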
1828 # Use HTM to set up an initial parent region (one arbitrary trixel)
1829 # and four child regions (the trixels within the parent at the next
1830 # level). We'll use the parent as a tract/visit region and the children
1831 # as its patch/visit_detector regions.
1832 registry = self.makeRegistry()
1833 htm6 = registry.dimensions.skypix["htm"][6].pixelization
1834 commonSkyPix = registry.dimensions.commonSkyPix.pixelization
1835 index = 12288
1836 child_ranges_small = lsst.sphgeom.RangeSet(index).scaled(4)
1837 assert htm6.universe().contains(child_ranges_small)
1838 child_regions_small = [htm6.triangle(i) for i in unpack_range_set(child_ranges_small)]
1839 parent_region_small = lsst.sphgeom.ConvexPolygon(
1840 list(itertools.chain.from_iterable(c.getVertices() for c in child_regions_small))
1841 )
1842 assert all(parent_region_small.contains(c) for c in child_regions_small)
1843 # Make a larger version of each child region, defined to be the set of
1844 # htm6 trixels that overlap the original's bounding circle. Make a new
1845 # parent that's the convex hull of the new children.
1846 child_regions_large = [
1847 range_set_hull(htm6.envelope(c.getBoundingCircle()), htm6) for c in child_regions_small
1848 ]
1849 assert all(
1850 large.contains(small)
1851 for large, small in zip(child_regions_large, child_regions_small, strict=True)
1852 )
1853 parent_region_large = lsst.sphgeom.ConvexPolygon(
1854 list(itertools.chain.from_iterable(c.getVertices() for c in child_regions_large))
1855 )
1856 assert all(parent_region_large.contains(c) for c in child_regions_large)
1857 assert parent_region_large.contains(parent_region_small)
1858 assert not parent_region_small.contains(parent_region_large)
1859 assert not all(parent_region_small.contains(c) for c in child_regions_large)
1860 # Find some commonSkyPix indices that overlap the large regions but do
1861 # not overlap the small regions. We use commonSkyPix here to make sure the
1862 # real tests later involve what's in the database, not just post-query
1863 # filtering of regions.
1864 child_difference_indices = []
1865 for large, small in zip(child_regions_large, child_regions_small, strict=True):
1866 difference = list(unpack_range_set(commonSkyPix.envelope(large) - commonSkyPix.envelope(small)))
1867 assert difference, "if this is empty, we can't test anything useful with these regions"
1868 assert all(
1869 not commonSkyPix.triangle(d).isDisjointFrom(large)
1870 and commonSkyPix.triangle(d).isDisjointFrom(small)
1871 for d in difference
1872 )
1873 child_difference_indices.append(difference)
1874 parent_difference_indices = list(
1875 unpack_range_set(
1876 commonSkyPix.envelope(parent_region_large) - commonSkyPix.envelope(parent_region_small)
1877 )
1878 )
1879 assert parent_difference_indices, "if this is empty, we can't test anything useful with these regions"
1880 assert all(
1881 (
1882 not commonSkyPix.triangle(d).isDisjointFrom(parent_region_large)
1883 and commonSkyPix.triangle(d).isDisjointFrom(parent_region_small)
1884 )
1885 for d in parent_difference_indices
1886 )
1887 # Now that we've finally got those regions, we'll insert the large ones
1888 # as tract/patch dimension records.
1889 skymap_name = "testing_v1"
1890 registry.insertDimensionData(
1891 "skymap",
1892 {
1893 "name": skymap_name,
1894 "hash": bytes([42]),
1895 "tract_max": 1,
1896 "patch_nx_max": 2,
1897 "patch_ny_max": 2,
1898 },
1899 )
1900 registry.insertDimensionData("tract", {"skymap": skymap_name, "id": 0, "region": parent_region_large})
1901 registry.insertDimensionData(
1902 "patch",
1903 *[
1904 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1905 for n, c in enumerate(child_regions_large)
1906 ],
1907 )
1908 # Add a dataset that uses these dimensions to make sure that modifying
1909 # them doesn't disrupt foreign keys (we need to make sure the DB doesn't
1910 # implement insert with replace=True as delete-then-insert).
1911 dataset_type = DatasetType(
1912 "coadd",
1913 dimensions=["tract", "patch"],
1914 universe=registry.dimensions,
1915 storageClass="Exposure",
1916 )
1917 registry.registerDatasetType(dataset_type)
1918 registry.registerCollection("the_run", CollectionType.RUN)
1919 registry.insertDatasets(
1920 dataset_type,
1921 [{"skymap": skymap_name, "tract": 0, "patch": 2}],
1922 run="the_run",
1923 )
1924 # Query for tracts and patches that overlap some "difference"
1925 # commonSkyPix pixels; there should be overlaps, because the database has
1926 # the "large" suite of regions.
1927 self.assertEqual(
1928 {0},
1929 {
1930 data_id["tract"]
1931 for data_id in registry.queryDataIds(
1932 ["tract"],
1933 skymap=skymap_name,
1934 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
1935 )
1936 },
1937 )
1938 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
1939 self.assertIn(
1940 patch_id,
1941 {
1942 data_id["patch"]
1943 for data_id in registry.queryDataIds(
1944 ["patch"],
1945 skymap=skymap_name,
1946 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
1947 )
1948 },
1949 )
1950 # Use sync to update the tract region and insert to update the regions
1951 # of the patches, to the "small" suite.
1952 updated = registry.syncDimensionData(
1953 "tract",
1954 {"skymap": skymap_name, "id": 0, "region": parent_region_small},
1955 update=True,
1956 )
1957 self.assertEqual(updated, {"region": parent_region_large})
1958 registry.insertDimensionData(
1959 "patch",
1960 *[
1961 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1962 for n, c in enumerate(child_regions_small)
1963 ],
1964 replace=True,
1965 )
1966 # Query again; there now should be no such overlaps, because the
1967 # database has the "small" suite of regions.
1968 self.assertFalse(
1969 set(
1970 registry.queryDataIds(
1971 ["tract"],
1972 skymap=skymap_name,
1973 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
1974 )
1975 )
1976 )
1977 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
1978 self.assertNotIn(
1979 patch_id,
1980 {
1981 data_id["patch"]
1982 for data_id in registry.queryDataIds(
1983 ["patch"],
1984 skymap=skymap_name,
1985 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
1986 )
1987 },
1988 )
1989 # Update back to the large regions and query one more time.
1990 updated = registry.syncDimensionData(
1991 "tract",
1992 {"skymap": skymap_name, "id": 0, "region": parent_region_large},
1993 update=True,
1994 )
1995 self.assertEqual(updated, {"region": parent_region_small})
1996 registry.insertDimensionData(
1997 "patch",
1998 *[
1999 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
2000 for n, c in enumerate(child_regions_large)
2001 ],
2002 replace=True,
2003 )
2004 self.assertEqual(
2005 {0},
2006 {
2007 data_id["tract"]
2008 for data_id in registry.queryDataIds(
2009 ["tract"],
2010 skymap=skymap_name,
2011 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
2012 )
2013 },
2014 )
2015 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
2016 self.assertIn(
2017 patch_id,
2018 {
2019 data_id["patch"]
2020 for data_id in registry.queryDataIds(
2021 ["patch"],
2022 skymap=skymap_name,
2023 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
2024 )
2025 },
2026 )
2028 def testCalibrationCollections(self):
2029 """Test operations on `~CollectionType.CALIBRATION` collections,
2030 including `Registry.certify`, `Registry.decertify`, and
2031 `Registry.findDataset`.
2032 """
2033 # Setup - make a Registry, fill it with some datasets in
2034 # non-calibration collections.
2035 registry = self.makeRegistry()
2036 self.loadData(registry, "base.yaml")
2037 self.loadData(registry, "datasets.yaml")
2038 # Set up some timestamps.
2039 t1 = astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai")
2040 t2 = astropy.time.Time("2020-01-01T02:00:00", format="isot", scale="tai")
2041 t3 = astropy.time.Time("2020-01-01T03:00:00", format="isot", scale="tai")
2042 t4 = astropy.time.Time("2020-01-01T04:00:00", format="isot", scale="tai")
2043 t5 = astropy.time.Time("2020-01-01T05:00:00", format="isot", scale="tai")
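# Build every ordered pair of bounds; `None` appears at both ends of the
# list so pairs with an unbounded begin, an unbounded end, and the fully
# unbounded Timespan(None, None) are all included.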
2044 allTimespans = [
2045 Timespan(a, b) for a, b in itertools.combinations([None, t1, t2, t3, t4, t5, None], r=2)
2046 ]
2047 # Get references to some datasets.
2048 bias2a = registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g")
2049 bias3a = registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g")
2050 bias2b = registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r")
2051 bias3b = registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r")
2052 # Register the main calibration collection we'll be working with.
2053 collection = "Cam1/calibs/default"
2054 registry.registerCollection(collection, type=CollectionType.CALIBRATION)
2055 # Cannot associate into a calibration collection (no timespan).
2056 with self.assertRaises(CollectionTypeError):
2057 registry.associate(collection, [bias2a])
2058 # Certify 2a dataset with [t2, t4) validity.
2059 registry.certify(collection, [bias2a], Timespan(begin=t2, end=t4))
2060 # Test that we can query for this dataset via the new collection, both
2061 # on its own and with a RUN collection, as long as we don't try to join
2062 # in temporal dimensions or use findFirst=True.
2063 self.assertEqual(
2064 set(registry.queryDatasets("bias", findFirst=False, collections=collection)),
2065 {bias2a},
2066 )
2067 self.assertEqual(
2068 set(registry.queryDatasets("bias", findFirst=False, collections=[collection, "imported_r"])),
2069 {
2070 bias2a,
2071 bias2b,
2072 bias3b,
2073 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
2074 },
2075 )
2076 self.assertEqual(
2077 set(registry.queryDataIds("detector", datasets="bias", collections=collection)),
2078 {registry.expandDataId(instrument="Cam1", detector=2)},
2079 )
2080 self.assertEqual(
2081 set(registry.queryDataIds("detector", datasets="bias", collections=[collection, "imported_r"])),
2082 {
2083 registry.expandDataId(instrument="Cam1", detector=2),
2084 registry.expandDataId(instrument="Cam1", detector=3),
2085 registry.expandDataId(instrument="Cam1", detector=4),
2086 },
2087 )
2089 # We should not be able to certify 2b with anything overlapping that
2090 # window.
2091 with self.assertRaises(ConflictingDefinitionError):
2092 registry.certify(collection, [bias2b], Timespan(begin=None, end=t3))
2093 with self.assertRaises(ConflictingDefinitionError):
2094 registry.certify(collection, [bias2b], Timespan(begin=None, end=t5))
2095 with self.assertRaises(ConflictingDefinitionError):
2096 registry.certify(collection, [bias2b], Timespan(begin=t1, end=t3))
2097 with self.assertRaises(ConflictingDefinitionError):
2098 registry.certify(collection, [bias2b], Timespan(begin=t1, end=t5))
2099 with self.assertRaises(ConflictingDefinitionError):
2100 registry.certify(collection, [bias2b], Timespan(begin=t1, end=None))
2101 with self.assertRaises(ConflictingDefinitionError):
2102 registry.certify(collection, [bias2b], Timespan(begin=t2, end=t3))
2103 with self.assertRaises(ConflictingDefinitionError):
2104 registry.certify(collection, [bias2b], Timespan(begin=t2, end=t5))
2105 with self.assertRaises(ConflictingDefinitionError):
2106 registry.certify(collection, [bias2b], Timespan(begin=t2, end=None))
2107 # We should be able to certify 3a with a range overlapping that window,
2108 # because it's for a different detector.
2109 # We'll certify 3a over [t1, t3).
2110 registry.certify(collection, [bias3a], Timespan(begin=t1, end=t3))
2111 # Now we'll certify 2b and 3b together over [t4, ∞).
2112 registry.certify(collection, [bias2b, bias3b], Timespan(begin=t4, end=None))
2114 # Fetch all associations and check that they are what we expect.
2115 self.assertCountEqual(
2116 list(
2117 registry.queryDatasetAssociations(
2118 "bias",
2119 collections=[collection, "imported_g", "imported_r"],
2120 )
2121 ),
2122 [
2123 DatasetAssociation(
2124 ref=registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
2125 collection="imported_g",
2126 timespan=None,
2127 ),
2128 DatasetAssociation(
2129 ref=registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
2130 collection="imported_r",
2131 timespan=None,
2132 ),
2133 DatasetAssociation(ref=bias2a, collection="imported_g", timespan=None),
2134 DatasetAssociation(ref=bias3a, collection="imported_g", timespan=None),
2135 DatasetAssociation(ref=bias2b, collection="imported_r", timespan=None),
2136 DatasetAssociation(ref=bias3b, collection="imported_r", timespan=None),
2137 DatasetAssociation(ref=bias2a, collection=collection, timespan=Timespan(begin=t2, end=t4)),
2138 DatasetAssociation(ref=bias3a, collection=collection, timespan=Timespan(begin=t1, end=t3)),
2139 DatasetAssociation(ref=bias2b, collection=collection, timespan=Timespan(begin=t4, end=None)),
2140 DatasetAssociation(ref=bias3b, collection=collection, timespan=Timespan(begin=t4, end=None)),
2141 ],
2142 )
2144 class Ambiguous:
2145 """Tag class to denote lookups that should be ambiguous."""
2147 pass
2149 def assertLookup(
2150 detector: int, timespan: Timespan, expected: DatasetRef | type[Ambiguous] | None
2151 ) -> None:
2152 """Local function that asserts that a bias lookup returns the given
2153 expected result.
2154 """
2155 if expected is Ambiguous:
2156 with self.assertRaises((DatasetTypeError, LookupError)):
2157 registry.findDataset(
2158 "bias",
2159 collections=collection,
2160 instrument="Cam1",
2161 detector=detector,
2162 timespan=timespan,
2163 )
2164 else:
2165 self.assertEqual(
2166 expected,
2167 registry.findDataset(
2168 "bias",
2169 collections=collection,
2170 instrument="Cam1",
2171 detector=detector,
2172 timespan=timespan,
2173 ),
2174 )
2176 # Systematically test lookups against expected results.
2177 assertLookup(detector=2, timespan=Timespan(None, t1), expected=None)
2178 assertLookup(detector=2, timespan=Timespan(None, t2), expected=None)
2179 assertLookup(detector=2, timespan=Timespan(None, t3), expected=bias2a)
2180 assertLookup(detector=2, timespan=Timespan(None, t4), expected=bias2a)
2181 assertLookup(detector=2, timespan=Timespan(None, t5), expected=Ambiguous)
2182 assertLookup(detector=2, timespan=Timespan(None, None), expected=Ambiguous)
2183 assertLookup(detector=2, timespan=Timespan(t1, t2), expected=None)
2184 assertLookup(detector=2, timespan=Timespan(t1, t3), expected=bias2a)
2185 assertLookup(detector=2, timespan=Timespan(t1, t4), expected=bias2a)
2186 assertLookup(detector=2, timespan=Timespan(t1, t5), expected=Ambiguous)
2187 assertLookup(detector=2, timespan=Timespan(t1, None), expected=Ambiguous)
2188 assertLookup(detector=2, timespan=Timespan(t2, t3), expected=bias2a)
2189 assertLookup(detector=2, timespan=Timespan(t2, t4), expected=bias2a)
2190 assertLookup(detector=2, timespan=Timespan(t2, t5), expected=Ambiguous)
2191 assertLookup(detector=2, timespan=Timespan(t2, None), expected=Ambiguous)
2192 assertLookup(detector=2, timespan=Timespan(t3, t4), expected=bias2a)
2193 assertLookup(detector=2, timespan=Timespan(t3, t5), expected=Ambiguous)
2194 assertLookup(detector=2, timespan=Timespan(t3, None), expected=Ambiguous)
2195 assertLookup(detector=2, timespan=Timespan(t4, t5), expected=bias2b)
2196 assertLookup(detector=2, timespan=Timespan(t4, None), expected=bias2b)
2197 assertLookup(detector=2, timespan=Timespan(t5, None), expected=bias2b)
2198 assertLookup(detector=3, timespan=Timespan(None, t1), expected=None)
2199 assertLookup(detector=3, timespan=Timespan(None, t2), expected=bias3a)
2200 assertLookup(detector=3, timespan=Timespan(None, t3), expected=bias3a)
2201 assertLookup(detector=3, timespan=Timespan(None, t4), expected=bias3a)
2202 assertLookup(detector=3, timespan=Timespan(None, t5), expected=Ambiguous)
2203 assertLookup(detector=3, timespan=Timespan(None, None), expected=Ambiguous)
2204 assertLookup(detector=3, timespan=Timespan(t1, t2), expected=bias3a)
2205 assertLookup(detector=3, timespan=Timespan(t1, t3), expected=bias3a)
2206 assertLookup(detector=3, timespan=Timespan(t1, t4), expected=bias3a)
2207 assertLookup(detector=3, timespan=Timespan(t1, t5), expected=Ambiguous)
2208 assertLookup(detector=3, timespan=Timespan(t1, None), expected=Ambiguous)
2209 assertLookup(detector=3, timespan=Timespan(t2, t3), expected=bias3a)
2210 assertLookup(detector=3, timespan=Timespan(t2, t4), expected=bias3a)
2211 assertLookup(detector=3, timespan=Timespan(t2, t5), expected=Ambiguous)
2212 assertLookup(detector=3, timespan=Timespan(t2, None), expected=Ambiguous)
2213 assertLookup(detector=3, timespan=Timespan(t3, t4), expected=None)
2214 assertLookup(detector=3, timespan=Timespan(t3, t5), expected=bias3b)
2215 assertLookup(detector=3, timespan=Timespan(t3, None), expected=bias3b)
2216 assertLookup(detector=3, timespan=Timespan(t4, t5), expected=bias3b)
2217 assertLookup(detector=3, timespan=Timespan(t4, None), expected=bias3b)
2218 assertLookup(detector=3, timespan=Timespan(t5, None), expected=bias3b)
2220 # Decertify [t3, t5) for all data IDs, and run the test lookups again.
2221 # This should truncate bias2a to [t2, t3), leave bias3a unchanged at
2222 # [t1, t3), and truncate bias2b and bias3b to [t5, ∞).
2223 registry.decertify(collection=collection, datasetType="bias", timespan=Timespan(t3, t5))
2224 assertLookup(detector=2, timespan=Timespan(None, t1), expected=None)
2225 assertLookup(detector=2, timespan=Timespan(None, t2), expected=None)
2226 assertLookup(detector=2, timespan=Timespan(None, t3), expected=bias2a)
2227 assertLookup(detector=2, timespan=Timespan(None, t4), expected=bias2a)
2228 assertLookup(detector=2, timespan=Timespan(None, t5), expected=bias2a)
2229 assertLookup(detector=2, timespan=Timespan(None, None), expected=Ambiguous)
2230 assertLookup(detector=2, timespan=Timespan(t1, t2), expected=None)
2231 assertLookup(detector=2, timespan=Timespan(t1, t3), expected=bias2a)
2232 assertLookup(detector=2, timespan=Timespan(t1, t4), expected=bias2a)
2233 assertLookup(detector=2, timespan=Timespan(t1, t5), expected=bias2a)
2234 assertLookup(detector=2, timespan=Timespan(t1, None), expected=Ambiguous)
2235 assertLookup(detector=2, timespan=Timespan(t2, t3), expected=bias2a)
2236 assertLookup(detector=2, timespan=Timespan(t2, t4), expected=bias2a)
2237 assertLookup(detector=2, timespan=Timespan(t2, t5), expected=bias2a)
2238 assertLookup(detector=2, timespan=Timespan(t2, None), expected=Ambiguous)
2239 assertLookup(detector=2, timespan=Timespan(t3, t4), expected=None)
2240 assertLookup(detector=2, timespan=Timespan(t3, t5), expected=None)
2241 assertLookup(detector=2, timespan=Timespan(t3, None), expected=bias2b)
2242 assertLookup(detector=2, timespan=Timespan(t4, t5), expected=None)
2243 assertLookup(detector=2, timespan=Timespan(t4, None), expected=bias2b)
2244 assertLookup(detector=2, timespan=Timespan(t5, None), expected=bias2b)
2245 assertLookup(detector=3, timespan=Timespan(None, t1), expected=None)
2246 assertLookup(detector=3, timespan=Timespan(None, t2), expected=bias3a)
2247 assertLookup(detector=3, timespan=Timespan(None, t3), expected=bias3a)
2248 assertLookup(detector=3, timespan=Timespan(None, t4), expected=bias3a)
2249 assertLookup(detector=3, timespan=Timespan(None, t5), expected=bias3a)
2250 assertLookup(detector=3, timespan=Timespan(None, None), expected=Ambiguous)
2251 assertLookup(detector=3, timespan=Timespan(t1, t2), expected=bias3a)
2252 assertLookup(detector=3, timespan=Timespan(t1, t3), expected=bias3a)
2253 assertLookup(detector=3, timespan=Timespan(t1, t4), expected=bias3a)
2254 assertLookup(detector=3, timespan=Timespan(t1, t5), expected=bias3a)
2255 assertLookup(detector=3, timespan=Timespan(t1, None), expected=Ambiguous)
2256 assertLookup(detector=3, timespan=Timespan(t2, t3), expected=bias3a)
2257 assertLookup(detector=3, timespan=Timespan(t2, t4), expected=bias3a)
2258 assertLookup(detector=3, timespan=Timespan(t2, t5), expected=bias3a)
2259 assertLookup(detector=3, timespan=Timespan(t2, None), expected=Ambiguous)
2260 assertLookup(detector=3, timespan=Timespan(t3, t4), expected=None)
2261 assertLookup(detector=3, timespan=Timespan(t3, t5), expected=None)
2262 assertLookup(detector=3, timespan=Timespan(t3, None), expected=bias3b)
2263 assertLookup(detector=3, timespan=Timespan(t4, t5), expected=None)
2264 assertLookup(detector=3, timespan=Timespan(t4, None), expected=bias3b)
2265 assertLookup(detector=3, timespan=Timespan(t5, None), expected=bias3b)
2267 # Decertify everything, this time with explicit data IDs, then check
2268 # that no lookups succeed.
2269 registry.decertify(
2270 collection,
2271 "bias",
2272 Timespan(None, None),
2273 dataIds=[
2274 dict(instrument="Cam1", detector=2),
2275 dict(instrument="Cam1", detector=3),
2276 ],
2277 )
2278 for detector in (2, 3):
2279 for timespan in allTimespans:
2280 assertLookup(detector=detector, timespan=timespan, expected=None)
2281 # Certify bias2a and bias3a over (-∞, ∞), check that all lookups return
2282 # those.
2283 registry.certify(
2284 collection,
2285 [bias2a, bias3a],
2286 Timespan(None, None),
2287 )
2288 for timespan in allTimespans:
2289 assertLookup(detector=2, timespan=timespan, expected=bias2a)
2290 assertLookup(detector=3, timespan=timespan, expected=bias3a)
2291 # Decertify just bias2 over [t2, t4).
2292 # This should split a single certification row into two (and leave the
2293 # other existing row, for bias3a, alone).
2294 registry.decertify(
2295 collection, "bias", Timespan(t2, t4), dataIds=[dict(instrument="Cam1", detector=2)]
2296 )
2297 for timespan in allTimespans:
2298 assertLookup(detector=3, timespan=timespan, expected=bias3a)
2299 overlapsBefore = timespan.overlaps(Timespan(None, t2))
2300 overlapsAfter = timespan.overlaps(Timespan(t4, None))
2301 if overlapsBefore and overlapsAfter:
2302 expected = Ambiguous
2303 elif overlapsBefore or overlapsAfter:
2304 expected = bias2a
2305 else:
2306 expected = None
2307 assertLookup(detector=2, timespan=timespan, expected=expected)
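# To summarize the decertify semantics exercised above: removing a window
# strictly inside an existing validity range splits that range into two
# rows. A pure-Python sketch of the resulting windows for bias2a (an
# illustration, not Registry API):
remaining_before = Timespan(None, t2)
removed = Timespan(t2, t4)
remaining_after = Timespan(t4, None)
assert not remaining_before.overlaps(removed)
assert not remaining_after.overlaps(removed)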
2309 def testSkipCalibs(self):
2310 """Test how queries handle skipping of calibration collections."""
2311 registry = self.makeRegistry()
2312 self.loadData(registry, "base.yaml")
2313 self.loadData(registry, "datasets.yaml")
2315 coll_calib = "Cam1/calibs/default"
2316 registry.registerCollection(coll_calib, type=CollectionType.CALIBRATION)
2318 # Add all biases to the calibration collection.
2319 # Without this, the logic that prunes dataset subqueries based on
2320 # datasetType-collection summary information will fire before the logic
2321 # we want to test below. This is a good thing (it avoids the dreaded
2322 # NotImplementedError a bit more often) everywhere but here.
2323 registry.certify(coll_calib, registry.queryDatasets("bias", collections=...), Timespan(None, None))
2325 coll_list = [coll_calib, "imported_g", "imported_r"]
2326 chain = "Cam1/chain"
2327 registry.registerCollection(chain, type=CollectionType.CHAINED)
2328 registry.setCollectionChain(chain, coll_list)
2330 # explicit list will raise if findFirst=True or there are temporal
2331 # dimensions
2332 with self.assertRaises(NotImplementedError):
2333 registry.queryDatasets("bias", collections=coll_list, findFirst=True)
2334 with self.assertRaises(NotImplementedError):
2335 registry.queryDataIds(
2336 ["instrument", "detector", "exposure"], datasets="bias", collections=coll_list
2337 ).count()
2339 # chain will skip
2340 datasets = list(registry.queryDatasets("bias", collections=chain))
2341 self.assertGreater(len(datasets), 0)
2343 dataIds = list(registry.queryDataIds(["instrument", "detector"], datasets="bias", collections=chain))
2344 self.assertGreater(len(dataIds), 0)
2346 # glob will skip too
2347 datasets = list(registry.queryDatasets("bias", collections="*d*"))
2348 self.assertGreater(len(datasets), 0)
2350 # regular expression will skip too
2351 pattern = re.compile(".*")
2352 datasets = list(registry.queryDatasets("bias", collections=pattern))
2353 self.assertGreater(len(datasets), 0)
2355 # ellipsis should work as usual
2356 datasets = list(registry.queryDatasets("bias", collections=...))
2357 self.assertGreater(len(datasets), 0)
2359 # a few tests with findFirst
2360 datasets = list(registry.queryDatasets("bias", collections=chain, findFirst=True))
2361 self.assertGreater(len(datasets), 0)
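# For completeness, the chain definition itself can be read back; a small
# sketch, assuming getCollectionChain returns the child collection names
# in search order:
children = registry.getCollectionChain(chain)
self.assertEqual(list(children), coll_list)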
2363 def testIngestTimeQuery(self):
2364 registry = self.makeRegistry()
2365 self.loadData(registry, "base.yaml")
2366 dt0 = datetime.utcnow()
2367 self.loadData(registry, "datasets.yaml")
2368 dt1 = datetime.utcnow()
2370 datasets = list(registry.queryDatasets(..., collections=...))
2371 len0 = len(datasets)
2372 self.assertGreater(len0, 0)
2374 where = "ingest_date > T'2000-01-01'"
2375 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2376 len1 = len(datasets)
2377 self.assertEqual(len0, len1)
2379 # no one will ever use this piece of software in 30 years
2380 where = "ingest_date > T'2050-01-01'"
2381 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2382 len2 = len(datasets)
2383 self.assertEqual(len2, 0)
2385 # Check more exact timing to make sure there is no 37-second offset
2386 # (after fixing DM-30124). SQLite time precision is 1 second, so make
2387 # sure that we don't test with higher precision.
2388 tests = [
2389 # format: (timestamp, operator, expected_len)
2390 (dt0 - timedelta(seconds=1), ">", len0),
2391 (dt0 - timedelta(seconds=1), "<", 0),
2392 (dt1 + timedelta(seconds=1), "<", len0),
2393 (dt1 + timedelta(seconds=1), ">", 0),
2394 ]
2395 for dt, op, expect_len in tests:
2396 dt_str = dt.isoformat(sep=" ")
2398 where = f"ingest_date {op} T'{dt_str}'"
2399 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2400 self.assertEqual(len(datasets), expect_len)
2402 # same with bind using datetime or astropy Time
2403 where = f"ingest_date {op} ingest_time"
2404 datasets = list(
2405 registry.queryDatasets(..., collections=..., where=where, bind={"ingest_time": dt})
2406 )
2407 self.assertEqual(len(datasets), expect_len)
2409 dt_astropy = astropy.time.Time(dt, format="datetime")
2410 datasets = list(
2411 registry.queryDatasets(..., collections=..., where=where, bind={"ingest_time": dt_astropy})
2412 )
2413 self.assertEqual(len(datasets), expect_len)
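# The T'...' literal used above accepts ISO-8601 timestamps; a sketch of
# building one from a datetime (same formatting as the loop above):
cutoff = datetime.utcnow().isoformat(sep=" ")
where = f"ingest_date < T'{cutoff}'"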
2415 def testTimespanQueries(self):
2416 """Test query expressions involving timespans."""
2417 registry = self.makeRegistry()
2418 self.loadData(registry, "hsc-rc2-subset.yaml")
2419 # All visits in the database; mapping from ID to timespan.
2420 visits = {record.id: record.timespan for record in registry.queryDimensionRecords("visit")}
2421 # Just those IDs, sorted (which is also temporal sorting, because HSC
2422 # visit IDs are monotonically increasing).
2423 ids = sorted(visits.keys())
2424 self.assertGreater(len(ids), 20)
2425 # Pick some quasi-random indexes into `ids` to play with.
2426 i1 = int(len(ids) * 0.1)
2427 i2 = int(len(ids) * 0.3)
2428 i3 = int(len(ids) * 0.6)
2429 i4 = int(len(ids) * 0.8)
2430 # Extract some times from those: just before the beginning of i1 (which
2431 # should be after the end of the previous visit), exactly the
2432 # beginning of i2, just after the beginning of i3 (and before its end),
2433 # and the exact end of i4.
2434 t1 = visits[ids[i1]].begin - astropy.time.TimeDelta(1.0, format="sec")
2435 self.assertGreater(t1, visits[ids[i1 - 1]].end)
2436 t2 = visits[ids[i2]].begin
2437 t3 = visits[ids[i3]].begin + astropy.time.TimeDelta(1.0, format="sec")
2438 self.assertLess(t3, visits[ids[i3]].end)
2439 t4 = visits[ids[i4]].end
2440 # Make sure those are actually in order.
2441 self.assertEqual([t1, t2, t3, t4], sorted([t4, t3, t2, t1]))
2443 bind = {
2444 "t1": t1,
2445 "t2": t2,
2446 "t3": t3,
2447 "t4": t4,
2448 "ts23": Timespan(t2, t3),
2449 }
2451 def query(where):
2452 """Return results as a sorted, deduplicated list of visit IDs."""
2453 return sorted(
2454 {
2455 dataId["visit"]
2456 for dataId in registry.queryDataIds("visit", instrument="HSC", bind=bind, where=where)
2457 }
2458 )
2460 # Try a bunch of timespan queries, mixing up the bounds themselves,
2461 # where they appear in the expression, and how we get the timespan into
2462 # the expression.
2464 # t1 is before the start of i1, so this should not include i1.
2465 self.assertEqual(ids[:i1], query("visit.timespan OVERLAPS (null, t1)"))
2466 # t2 is exactly at the start of i2, but ends are exclusive, so these
2467 # should not include i2.
2468 self.assertEqual(ids[i1:i2], query("(t1, t2) OVERLAPS visit.timespan"))
2469 self.assertEqual(ids[:i2], query("visit.timespan < (t2, t4)"))
2470 # t3 is in the middle of i3, so this should include i3.
2471 self.assertEqual(ids[i2 : i3 + 1], query("visit.timespan OVERLAPS ts23"))
2472 # This one should not include i3, by the same reasoning.
2473 self.assertEqual(ids[i3 + 1 :], query("visit.timespan > (t1, t3)"))
2474 # t4 is exactly at the end of i4, so this should include i4.
2475 self.assertEqual(ids[i3 : i4 + 1], query(f"visit.timespan OVERLAPS (T'{t3.tai.isot}', t4)"))
2476 # i4's upper bound of t4 is exclusive, so this should not include i4.
2477 self.assertEqual(ids[i4 + 1 :], query("visit.timespan OVERLAPS (t4, NULL)"))
2479 # Now some timespan vs. time scalar queries.
2480 self.assertEqual(ids[:i2], query("visit.timespan < t2"))
2481 self.assertEqual(ids[:i2], query("t2 > visit.timespan"))
2482 self.assertEqual(ids[i3 + 1 :], query("visit.timespan > t3"))
2483 self.assertEqual(ids[i3 + 1 :], query("t3 < visit.timespan"))
2484 self.assertEqual(ids[i3 : i3 + 1], query("visit.timespan OVERLAPS t3"))
2485 self.assertEqual(ids[i3 : i3 + 1], query(f"T'{t3.tai.isot}' OVERLAPS visit.timespan"))
2487 # Empty timespans should not overlap anything.
2488 self.assertEqual([], query("visit.timespan OVERLAPS (t3, t2)"))
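# The final assertion relies on (t3, t2) being an empty window. A hedged
# Python-side sketch, assuming the Timespan class, like the expression
# engine, normalizes an inverted pair of bounds to an empty timespan:
empty = Timespan(t3, t2)
assert not empty.overlaps(Timespan(t1, t4))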
2490 def testCollectionSummaries(self):
2491 """Test recording and retrieval of collection summaries."""
2492 self.maxDiff = None
2493 registry = self.makeRegistry()
2494 # Importing datasets from yaml should go through the code path where
2495 # we update collection summaries as we insert datasets.
2496 self.loadData(registry, "base.yaml")
2497 self.loadData(registry, "datasets.yaml")
2498 flat = registry.getDatasetType("flat")
2499 expected1 = CollectionSummary()
2500 expected1.dataset_types.add(registry.getDatasetType("bias"))
2501 expected1.add_data_ids(
2502 flat, [DataCoordinate.standardize(instrument="Cam1", universe=registry.dimensions)]
2503 )
2504 self.assertEqual(registry.getCollectionSummary("imported_g"), expected1)
2505 self.assertEqual(registry.getCollectionSummary("imported_r"), expected1)
2506 # Create a chained collection with both of the imported runs; the
2507 # summary should be the same, because it's a union with itself.
2508 chain = "chain"
2509 registry.registerCollection(chain, CollectionType.CHAINED)
2510 registry.setCollectionChain(chain, ["imported_r", "imported_g"])
2511 self.assertEqual(registry.getCollectionSummary(chain), expected1)
2512 # Associate flats only into a tagged collection and a calibration
2513 # collection to check summaries of those.
2514 tag = "tag"
2515 registry.registerCollection(tag, CollectionType.TAGGED)
2516 registry.associate(tag, registry.queryDatasets(flat, collections="imported_g"))
2517 calibs = "calibs"
2518 registry.registerCollection(calibs, CollectionType.CALIBRATION)
2519 registry.certify(
2520 calibs, registry.queryDatasets(flat, collections="imported_g"), timespan=Timespan(None, None)
2521 )
2522 expected2 = expected1.copy()
2523 expected2.dataset_types.discard("bias")
2524 self.assertEqual(registry.getCollectionSummary(tag), expected2)
2525 self.assertEqual(registry.getCollectionSummary(calibs), expected2)
2526 # Explicitly calling Registry.refresh() should load those same
2527 # summaries, via a totally different code path.
2528 registry.refresh()
2529 self.assertEqual(registry.getCollectionSummary("imported_g"), expected1)
2530 self.assertEqual(registry.getCollectionSummary("imported_r"), expected1)
2531 self.assertEqual(registry.getCollectionSummary(tag), expected2)
2532 self.assertEqual(registry.getCollectionSummary(calibs), expected2)
2534 def testBindInQueryDatasets(self):
2535 """Test that the bind parameter is correctly forwarded in
2536 queryDatasets recursion.
2537 """
2538 registry = self.makeRegistry()
2539 # Importing datasets from yaml should go through the code path where
2540 # we update collection summaries as we insert datasets.
2541 self.loadData(registry, "base.yaml")
2542 self.loadData(registry, "datasets.yaml")
2543 self.assertEqual(
2544 set(registry.queryDatasets("flat", band="r", collections=...)),
2545 set(registry.queryDatasets("flat", where="band=my_band", bind={"my_band": "r"}, collections=...)),
2546 )
2548 def testQueryIntRangeExpressions(self):
2549 """Test integer range expressions in ``where`` arguments.
2551 Note that our expressions use inclusive stop values, unlike Python's.
2552 """
2553 registry = self.makeRegistry()
2554 self.loadData(registry, "base.yaml")
2555 self.assertEqual(
2556 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (1..2)")),
2557 {registry.expandDataId(instrument="Cam1", detector=n) for n in [1, 2]},
2558 )
2559 self.assertEqual(
2560 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (1..4:2)")),
2561 {registry.expandDataId(instrument="Cam1", detector=n) for n in [1, 3]},
2562 )
2563 self.assertEqual(
2564 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (2..4:2)")),
2565 {registry.expandDataId(instrument="Cam1", detector=n) for n in [2, 4]},
2566 )
2568 def testQueryResultSummaries(self):
2569 """Test summary methods like `count`, `any`, and `explain_no_results`
2570 on `DataCoordinateQueryResults` and `DatasetQueryResults`.
2571 """
2572 registry = self.makeRegistry()
2573 self.loadData(registry, "base.yaml")
2574 self.loadData(registry, "datasets.yaml")
2575 self.loadData(registry, "spatial.yaml")
2576 # Default test dataset has two collections, each with both flats and
2577 # biases. Add a new collection with only biases.
2578 registry.registerCollection("biases", CollectionType.TAGGED)
2579 registry.associate("biases", registry.queryDatasets("bias", collections=["imported_g"]))
2580 # First query yields two results, and involves no postprocessing.
2581 query1 = registry.queryDataIds(["physical_filter"], band="r")
2582 self.assertTrue(query1.any(execute=False, exact=False))
2583 self.assertTrue(query1.any(execute=True, exact=False))
2584 self.assertTrue(query1.any(execute=True, exact=True))
2585 self.assertEqual(query1.count(exact=False), 2)
2586 self.assertEqual(query1.count(exact=True), 2)
2587 self.assertFalse(list(query1.explain_no_results()))
2588 # Second query should yield no results, which the summary methods
2589 # should all report.
2590 query2 = registry.queryDataIds(["physical_filter"], band="h")
2591 # There's no execute=False, exact=False test here because the behavior
2592 # is not something we want to guarantee in this case (and exact=False
2593 # says either answer is legal).
2594 self.assertFalse(query2.any(execute=True, exact=False))
2595 self.assertFalse(query2.any(execute=True, exact=True))
2596 self.assertEqual(query2.count(exact=False), 0)
2597 self.assertEqual(query2.count(exact=True), 0)
2598 self.assertTrue(list(query2.explain_no_results()))
2599 # These queries yield no results due to various problems that can be
2600 # spotted prior to execution, yielding helpful diagnostics.
2601 base_query = registry.queryDataIds(["detector", "physical_filter"])
2602 queries_and_snippets = [
2603 (
2604 # Dataset type name doesn't match any existing dataset types.
2605 registry.queryDatasets("nonexistent", collections=...),
2606 ["nonexistent"],
2607 ),
2608 (
2609 # Dataset type object isn't registered.
2610 registry.queryDatasets(
2611 DatasetType(
2612 "nonexistent",
2613 dimensions=["instrument"],
2614 universe=registry.dimensions,
2615 storageClass="Image",
2616 ),
2617 collections=...,
2618 ),
2619 ["nonexistent"],
2620 ),
2621 (
2622 # No datasets of this type in this collection.
2623 registry.queryDatasets("flat", collections=["biases"]),
2624 ["flat", "biases"],
2625 ),
2626 (
2627 # No datasets of this type in this collection.
2628 base_query.findDatasets("flat", collections=["biases"]),
2629 ["flat", "biases"],
2630 ),
2631 (
2632 # No collections matching at all.
2633 registry.queryDatasets("flat", collections=re.compile("potato.+")),
2634 ["potato"],
2635 ),
2636 ]
2637 # The behavior of these additional queries is slated to change in the
2638 # future, so we also check for deprecation warnings.
2639 with self.assertWarns(FutureWarning):
2640 queries_and_snippets.append(
2641 (
2642 # Dataset type name doesn't match any existing dataset
2643 # types.
2644 registry.queryDataIds(["detector"], datasets=["nonexistent"], collections=...),
2645 ["nonexistent"],
2646 )
2647 )
2648 with self.assertWarns(FutureWarning):
2649 queries_and_snippets.append(
2650 (
2651 # Dataset type name doesn't match any existing dataset
2652 # types.
2653 registry.queryDimensionRecords("detector", datasets=["nonexistent"], collections=...),
2654 ["nonexistent"],
2655 )
2656 )
2657 for query, snippets in queries_and_snippets:
2658 self.assertFalse(query.any(execute=False, exact=False))
2659 self.assertFalse(query.any(execute=True, exact=False))
2660 self.assertFalse(query.any(execute=True, exact=True))
2661 self.assertEqual(query.count(exact=False), 0)
2662 self.assertEqual(query.count(exact=True), 0)
2663 messages = list(query.explain_no_results())
2664 self.assertTrue(messages)
2665 # Want all expected snippets to appear in at least one message.
2666 self.assertTrue(
2667 any(
2668 all(snippet in message for snippet in snippets) for message in query.explain_no_results()
2669 ),
2670 messages,
2671 )
2673 # This query does yield results, but should also emit a warning because
2674 # passing dataset type patterns to queryDataIds is deprecated; just
2675 # look for the warning.
2676 with self.assertWarns(FutureWarning):
2677 registry.queryDataIds(["detector"], datasets=re.compile("^nonexistent$"), collections=...)
2679 # These queries yield no results due to problems that can be identified
2680 # by cheap follow-up queries, yielding helpful diagnostics.
2681 for query, snippets in [
2682 (
2683 # No records for one of the involved dimensions.
2684 registry.queryDataIds(["subfilter"]),
2685 ["no rows", "subfilter"],
2686 ),
2687 (
2688 # No records for one of the involved dimensions.
2689 registry.queryDimensionRecords("subfilter"),
2690 ["no rows", "subfilter"],
2691 ),
2692 ]:
2693 self.assertFalse(query.any(execute=True, exact=False))
2694 self.assertFalse(query.any(execute=True, exact=True))
2695 self.assertEqual(query.count(exact=True), 0)
2696 messages = list(query.explain_no_results())
2697 self.assertTrue(messages)
2698 # Want all expected snippets to appear in at least one message.
2699 self.assertTrue(
2700 any(
2701 all(snippet in message for snippet in snippets) for message in query.explain_no_results()
2702 ),
2703 messages,
2704 )
2706 # This query yields four overlaps in the database, but one is filtered
2707 # out in postprocessing. The count queries aren't accurate because
2708 # they don't account for duplication that happens due to an internal
2709 # join against commonSkyPix.
2710 query3 = registry.queryDataIds(["visit", "tract"], instrument="Cam1", skymap="SkyMap1")
2711 self.assertEqual(
2712 {
2713 DataCoordinate.standardize(
2714 instrument="Cam1",
2715 skymap="SkyMap1",
2716 visit=v,
2717 tract=t,
2718 universe=registry.dimensions,
2719 )
2720 for v, t in [(1, 0), (2, 0), (2, 1)]
2721 },
2722 set(query3),
2723 )
2724 self.assertTrue(query3.any(execute=False, exact=False))
2725 self.assertTrue(query3.any(execute=True, exact=False))
2726 self.assertTrue(query3.any(execute=True, exact=True))
2727 self.assertGreaterEqual(query3.count(exact=False), 4)
2728 self.assertGreaterEqual(query3.count(exact=True, discard=True), 3)
2729 self.assertFalse(list(query3.explain_no_results()))
2730 # This query yields overlaps in the database, but all are filtered
2731 # out in postprocessing. The count queries again aren't very useful.
2732 # We have to use `where=` here to avoid an optimization that
2733 # (currently) skips the spatial postprocess-filtering because it
2734 # recognizes that no spatial join is necessary. That's not ideal, but
2735 # fixing it is out of scope for this ticket.
2736 query4 = registry.queryDataIds(
2737 ["visit", "tract"],
2738 instrument="Cam1",
2739 skymap="SkyMap1",
2740 where="visit=1 AND detector=1 AND tract=0 AND patch=4",
2741 )
2742 self.assertFalse(set(query4))
2743 self.assertTrue(query4.any(execute=False, exact=False))
2744 self.assertTrue(query4.any(execute=True, exact=False))
2745 self.assertFalse(query4.any(execute=True, exact=True))
2746 self.assertGreaterEqual(query4.count(exact=False), 1)
2747 self.assertEqual(query4.count(exact=True, discard=True), 0)
2748 messages = query4.explain_no_results()
2749 self.assertTrue(messages)
2750 self.assertTrue(any("overlap" in message for message in messages))
2751 # This query should yield results from one dataset type but not the
2752 # other, which is not registered.
2753 query5 = registry.queryDatasets(["bias", "nonexistent"], collections=["biases"])
2754 self.assertTrue(set(query5))
2755 self.assertTrue(query5.any(execute=False, exact=False))
2756 self.assertTrue(query5.any(execute=True, exact=False))
2757 self.assertTrue(query5.any(execute=True, exact=True))
2758 self.assertGreaterEqual(query5.count(exact=False), 1)
2759 self.assertGreaterEqual(query5.count(exact=True), 1)
2760 self.assertFalse(list(query5.explain_no_results()))
2761 # This query applies a selection that yields no results, fully in the
2762 # database. Explaining why it fails involves traversing the relation
2763 # tree and running a LIMIT 1 query at each level that has the potential
2764 # to remove rows.
2765 query6 = registry.queryDimensionRecords(
2766 "detector", where="detector.purpose = 'no-purpose'", instrument="Cam1"
2767 )
2768 self.assertEqual(query6.count(exact=True), 0)
2769 messages = query6.explain_no_results()
2770 self.assertTrue(messages)
2771 self.assertTrue(any("no-purpose" in message for message in messages))
2773 def testQueryDataIdsExpressionError(self):
2774 """Test error checking of 'where' expressions in queryDataIds."""
2775 registry = self.makeRegistry()
2776 self.loadData(registry, "base.yaml")
2777 bind = {"time": astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai")}
2778 with self.assertRaisesRegex(LookupError, r"No dimension element with name 'foo' in 'foo\.bar'\."):
2779 registry.queryDataIds(["detector"], where="foo.bar = 12")
2780 with self.assertRaisesRegex(
2781 LookupError, "Dimension element name cannot be inferred in this context."
2782 ):
2783 registry.queryDataIds(["detector"], where="timespan.end < time", bind=bind)
2785 def testQueryDataIdsOrderBy(self):
2786 """Test order_by and limit on result returned by queryDataIds()."""
2787 registry = self.makeRegistry()
2788 self.loadData(registry, "base.yaml")
2789 self.loadData(registry, "datasets.yaml")
2790 self.loadData(registry, "spatial.yaml")
2792 def do_query(dimensions=("visit", "tract"), datasets=None, collections=None):
2793 return registry.queryDataIds(
2794 dimensions, datasets=datasets, collections=collections, instrument="Cam1", skymap="SkyMap1"
2795 )
2797 Test = namedtuple(
2798 "testQueryDataIdsOrderByTest",
2799 ("order_by", "keys", "result", "limit", "datasets", "collections"),
2800 defaults=(None, None, None),
2801 )
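# Each case gives the order_by spec, the dimension keys to compare, the
# expected tuples of those keys in order, plus optional limit arguments
# and a dataset/collections constraint.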
2803 test_data = (
2804 Test("tract,visit", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2805 Test("-tract,visit", "tract,visit", ((1, 2), (1, 2), (0, 1), (0, 1), (0, 2), (0, 2))),
2806 Test("tract,-visit", "tract,visit", ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2))),
2807 Test("-tract,-visit", "tract,visit", ((1, 2), (1, 2), (0, 2), (0, 2), (0, 1), (0, 1))),
2808 Test(
2809 "tract.id,visit.id",
2810 "tract,visit",
2811 ((0, 1), (0, 1), (0, 2)),
2812 limit=(3,),
2813 ),
2814 Test("-tract,-visit", "tract,visit", ((1, 2), (1, 2), (0, 2)), limit=(3,)),
2815 Test("tract,visit", "tract,visit", ((0, 2), (1, 2), (1, 2)), limit=(3, 3)),
2816 Test("-tract,-visit", "tract,visit", ((0, 1),), limit=(3, 5)),
2817 Test(
2818 "tract,visit.exposure_time", "tract,visit", ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2))
2819 ),
2820 Test(
2821 "-tract,-visit.exposure_time", "tract,visit", ((1, 2), (1, 2), (0, 1), (0, 1), (0, 2), (0, 2))
2822 ),
2823 Test("tract,-exposure_time", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2824 Test("tract,visit.name", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2825 Test(
2826 "tract,-timespan.begin,timespan.end",
2827 "tract,visit",
2828 ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2)),
2829 ),
2830 Test("visit.day_obs,exposure.day_obs", "visit,exposure", ()),
2831 Test("visit.timespan.begin,-exposure.timespan.begin", "visit,exposure", ()),
2832 Test(
2833 "tract,detector",
2834 "tract,detector",
2835 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2836 datasets="flat",
2837 collections="imported_r",
2838 ),
2839 Test(
2840 "tract,detector.full_name",
2841 "tract,detector",
2842 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2843 datasets="flat",
2844 collections="imported_r",
2845 ),
2846 Test(
2847 "tract,detector.raft,detector.name_in_raft",
2848 "tract,detector",
2849 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2850 datasets="flat",
2851 collections="imported_r",
2852 ),
2853 )
2855 for test in test_data:
2856 order_by = test.order_by.split(",")
2857 keys = test.keys.split(",")
2858 query = do_query(keys, test.datasets, test.collections).order_by(*order_by)
2859 if test.limit is not None:
2860 query = query.limit(*test.limit)
2861 dataIds = tuple(tuple(dataId[k] for k in keys) for dataId in query)
2862 self.assertEqual(dataIds, test.result)
2864 # Materializing a query with order_by/limit applied should raise.
2865 query = do_query(keys).order_by(*order_by)
2866 if test.limit is not None:
2867 query = query.limit(*test.limit)
2868 with self.assertRaises(RelationalAlgebraError):
2869 with query.materialize():
2870 pass
2872 # Malformed or unknown names in order_by should raise ValueError.
2873 for order_by in ("", "-"):
2874 with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"):
2875 list(do_query().order_by(order_by))
2877 for order_by in ("undimension.name", "-undimension.name"):
2878 with self.assertRaisesRegex(ValueError, "Unknown dimension element 'undimension'"):
2879 list(do_query().order_by(order_by))
2881 for order_by in ("attract", "-attract"):
2882 with self.assertRaisesRegex(ValueError, "Metadata 'attract' cannot be found in any dimension"):
2883 list(do_query().order_by(order_by))
2885 with self.assertRaisesRegex(ValueError, "Metadata 'exposure_time' exists in more than one dimension"):
2886 list(do_query(("exposure", "visit")).order_by("exposure_time"))
2888 with self.assertRaisesRegex(
2889 ValueError,
2890 r"Timespan exists in more than one dimension element \(exposure, visit\); "
2891 r"qualify timespan with specific dimension name\.",
2892 ):
2893 list(do_query(("exposure", "visit")).order_by("timespan.begin"))
2895 with self.assertRaisesRegex(
2896 ValueError, "Cannot find any temporal dimension element for 'timespan.begin'"
2897 ):
2898 list(do_query("tract").order_by("timespan.begin"))
2900 with self.assertRaisesRegex(ValueError, "Cannot use 'timespan.begin' with non-temporal element"):
2901 list(do_query("tract").order_by("tract.timespan.begin"))
2903 with self.assertRaisesRegex(ValueError, "Field 'name' does not exist in 'tract'."):
2904 list(do_query("tract").order_by("tract.name"))
2906 with self.assertRaisesRegex(
2907 ValueError, r"Unknown dimension element 'timestamp'; perhaps you meant 'timespan.begin'\?"
2908 ):
2909 list(do_query("visit").order_by("timestamp.begin"))
2911 def testQueryDataIdsGovernorExceptions(self):
2912 """Test exceptions raised by queryDataIds() for incorrect governors."""
2913 registry = self.makeRegistry()
2914 self.loadData(registry, "base.yaml")
2915 self.loadData(registry, "datasets.yaml")
2916 self.loadData(registry, "spatial.yaml")
2918 def do_query(dimensions, dataId=None, where="", bind=None, **kwargs):
2919 return registry.queryDataIds(dimensions, dataId=dataId, where=where, bind=bind, **kwargs)
2921 Test = namedtuple(
2922 "testQueryDataIdExceptionsTest",
2923 ("dimensions", "dataId", "where", "bind", "kwargs", "exception", "count"),
2924 defaults=(None, None, None, {}, None, 0),
2925 )
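# Fields: the dimensions to query; optional dataId/where/bind/kwargs
# constraints; the exception expected (if any); and the expected row
# count when no exception is raised.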
2927 test_data = (
2928 Test("tract,visit", count=6),
2929 Test("tract,visit", kwargs={"instrument": "Cam1", "skymap": "SkyMap1"}, count=6),
2930 Test(
2931 "tract,visit", kwargs={"instrument": "Cam2", "skymap": "SkyMap1"}, exception=DataIdValueError
2932 ),
2933 Test("tract,visit", dataId={"instrument": "Cam1", "skymap": "SkyMap1"}, count=6),
2934 Test(
2935 "tract,visit", dataId={"instrument": "Cam1", "skymap": "SkyMap2"}, exception=DataIdValueError
2936 ),
2937 Test("tract,visit", where="instrument='Cam1' AND skymap='SkyMap1'", count=6),
2938 Test("tract,visit", where="instrument='Cam1' AND skymap='SkyMap5'", exception=DataIdValueError),
2939 Test(
2940 "tract,visit",
2941 where="instrument=cam AND skymap=map",
2942 bind={"cam": "Cam1", "map": "SkyMap1"},
2943 count=6,
2944 ),
2945 Test(
2946 "tract,visit",
2947 where="instrument=cam AND skymap=map",
2948 bind={"cam": "Cam", "map": "SkyMap"},
2949 exception=DataIdValueError,
2950 ),
2951 )
2953 for test in test_data:
2954 dimensions = test.dimensions.split(",")
2955 if test.exception:
2956 with self.assertRaises(test.exception):
2957 do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs).count()
2958 else:
2959 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2960 self.assertEqual(query.count(discard=True), test.count)
2962 # Repeat the same checks with materialized queries.
2963 if test.exception:
2964 with self.assertRaises(test.exception):
2965 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2966 with query.materialize() as materialized:
2967 materialized.count(discard=True)
2968 else:
2969 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2970 with query.materialize() as materialized:
2971 self.assertEqual(materialized.count(discard=True), test.count)
2973 def testQueryDimensionRecordsOrderBy(self):
2974 """Test order_by and limit on result returned by
2975 queryDimensionRecords().
2976 """
2977 registry = self.makeRegistry()
2978 self.loadData(registry, "base.yaml")
2979 self.loadData(registry, "datasets.yaml")
2980 self.loadData(registry, "spatial.yaml")
2982 def do_query(element, datasets=None, collections=None):
2983 return registry.queryDimensionRecords(
2984 element, instrument="Cam1", datasets=datasets, collections=collections
2985 )
2987 query = do_query("detector")
2988 self.assertEqual(len(list(query)), 4)
2990 Test = namedtuple(
2991 "testQueryDataIdsOrderByTest",
2992 ("element", "order_by", "result", "limit", "datasets", "collections"),
2993 defaults=(None, None, None),
2994 )
2996 test_data = (
2997 Test("detector", "detector", (1, 2, 3, 4)),
2998 Test("detector", "-detector", (4, 3, 2, 1)),
2999 Test("detector", "raft,-name_in_raft", (2, 1, 4, 3)),
3000 Test("detector", "-detector.purpose", (4,), limit=(1,)),
3001 Test("detector", "-purpose,detector.raft,name_in_raft", (2, 3), limit=(2, 2)),
3002 Test("visit", "visit", (1, 2)),
3003 Test("visit", "-visit.id", (2, 1)),
3004 Test("visit", "zenith_angle", (1, 2)),
3005 Test("visit", "-visit.name", (2, 1)),
3006 Test("visit", "day_obs,-timespan.begin", (2, 1)),
3007 )
3009 for test in test_data:
3010 order_by = test.order_by.split(",")
3011 query = do_query(test.element).order_by(*order_by)
3012 if test.limit is not None:
3013 query = query.limit(*test.limit)
3014 dataIds = tuple(rec.id for rec in query)
3015 self.assertEqual(dataIds, test.result)
3017 # Malformed or unknown names in order_by should raise ValueError.
3018 for order_by in ("", "-"):
3019 with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"):
3020 list(do_query("detector").order_by(order_by))
3022 for order_by in ("undimension.name", "-undimension.name"):
3023 with self.assertRaisesRegex(ValueError, "Element name mismatch: 'undimension'"):
3024 list(do_query("detector").order_by(order_by))
3026 for order_by in ("attract", "-attract"):
3027 with self.assertRaisesRegex(ValueError, "Field 'attract' does not exist in 'detector'."):
3028 list(do_query("detector").order_by(order_by))
3030 for order_by in ("timestamp.begin", "-timestamp.begin"):
3031 with self.assertRaisesRegex(
3032 ValueError,
3033 r"Element name mismatch: 'timestamp' instead of 'visit'; "
3034 r"perhaps you meant 'timespan.begin'\?",
3035 ):
3036 list(do_query("visit").order_by(order_by))
3038 def testQueryDimensionRecordsExceptions(self):
3039 """Test exceptions raised by queryDimensionRecords()."""
3040 registry = self.makeRegistry()
3041 self.loadData(registry, "base.yaml")
3042 self.loadData(registry, "datasets.yaml")
3043 self.loadData(registry, "spatial.yaml")
3045 result = registry.queryDimensionRecords("detector")
3046 self.assertEqual(result.count(), 4)
3047 result = registry.queryDimensionRecords("detector", instrument="Cam1")
3048 self.assertEqual(result.count(), 4)
3049 result = registry.queryDimensionRecords("detector", dataId={"instrument": "Cam1"})
3050 self.assertEqual(result.count(), 4)
3051 result = registry.queryDimensionRecords("detector", where="instrument='Cam1'")
3052 self.assertEqual(result.count(), 4)
3053 result = registry.queryDimensionRecords("detector", where="instrument=instr", bind={"instr": "Cam1"})
3054 self.assertEqual(result.count(), 4)
3056 with self.assertRaisesRegex(DataIdValueError, "dimension instrument"):
3057 result = registry.queryDimensionRecords("detector", instrument="NotCam1")
3058 result.count()
3060 with self.assertRaisesRegex(DataIdValueError, "dimension instrument"):
3061 result = registry.queryDimensionRecords("detector", dataId={"instrument": "NotCam1"})
3062 result.count()
3064 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
3065 result = registry.queryDimensionRecords("detector", where="instrument='NotCam1'")
3066 result.count()
3068 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
3069 result = registry.queryDimensionRecords(
3070 "detector", where="instrument=instr", bind={"instr": "NotCam1"}
3071 )
3072 result.count()
3074 def testDatasetConstrainedDimensionRecordQueries(self):
3075 """Test that queryDimensionRecords works even when given a dataset
3076 constraint whose dimensions extend beyond the requested dimension
3077 element's.
3078 """
3079 registry = self.makeRegistry()
3080 self.loadData(registry, "base.yaml")
3081 self.loadData(registry, "datasets.yaml")
3082 # Query for physical_filter dimension records, using a dataset that
3083 # has physical_filter as well as additional dimensions.
3084 records = registry.queryDimensionRecords(
3085 "physical_filter",
3086 datasets=["flat"],
3087 collections="imported_r",
3088 )
3089 self.assertEqual({record.name for record in records}, {"Cam1-R1", "Cam1-R2"})
3090 # Trying to constrain by all dataset types is an error.
3091 with self.assertRaises(TypeError):
3092 list(registry.queryDimensionRecords("physical_filter", datasets=..., collections="imported_r"))
3094 def testSkyPixDatasetQueries(self):
3095 """Test that we can build queries involving skypix dimensions as long
3096 as a dataset type that uses those dimensions is included.
3097 """
3098 registry = self.makeRegistry()
3099 self.loadData(registry, "base.yaml")
3100 dataset_type = DatasetType(
3101 "a", dimensions=["htm7", "instrument"], universe=registry.dimensions, storageClass="int"
3102 )
3103 registry.registerDatasetType(dataset_type)
3104 run = "r"
3105 registry.registerRun(run)
3106 # First try queries where there are no datasets; the concern is whether
3107 # we can even build and execute these queries without raising, even
3108 # when "doomed" query shortcuts are in play.
3109 self.assertFalse(
3110 list(registry.queryDataIds(["htm7", "instrument"], datasets=dataset_type, collections=run))
3111 )
3112 self.assertFalse(list(registry.queryDatasets(dataset_type, collections=run)))
3113 # Now add a dataset and see that we can get it back.
3114 htm7 = registry.dimensions.skypix["htm"][7].pixelization
3115 data_id = registry.expandDataId(instrument="Cam1", htm7=htm7.universe()[0][0])
3116 (ref,) = registry.insertDatasets(dataset_type, [data_id], run=run)
3117 self.assertEqual(
3118 set(registry.queryDataIds(["htm7", "instrument"], datasets=dataset_type, collections=run)),
3119 {data_id},
3120 )
3121 self.assertEqual(set(registry.queryDatasets(dataset_type, collections=run)), {ref})
3123 def testDatasetIdFactory(self):
3124 """Simple test for DatasetIdFactory, mostly to catch potential changes
3125 in its API.
3126 """
3127 registry = self.makeRegistry()
3128 factory = DatasetIdFactory()
3129 dataset_type = DatasetType(
3130 "datasetType",
3131 dimensions=["detector", "instrument"],
3132 universe=registry.dimensions,
3133 storageClass="int",
3134 )
3135 run = "run"
3136 data_id = DataCoordinate.standardize(instrument="Cam1", detector=1, graph=dataset_type.dimensions)
3138 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.UNIQUE)
3139 self.assertIsInstance(datasetId, uuid.UUID)
3140 self.assertEqual(datasetId.version, 4)
3142 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE)
3143 self.assertIsInstance(datasetId, uuid.UUID)
3144 self.assertEqual(datasetId.version, 5)
3146 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN)
3147 self.assertIsInstance(datasetId, uuid.UUID)
3148 self.assertEqual(datasetId.version, 5)
3150 def testExposureQueries(self):
3151 """Test query methods using arguments sourced from the exposure log
3152 service.
3154 The most complete test dataset currently available to daf_butler tests
3155 is the hsc-rc2-subset.yaml export (which is unfortunately distinct
3156 from the lsst/rc2_subset GitHub repo), but it does not have 'exposure'
3157 dimension records, as it was focused on providing nontrivial spatial
3158 overlaps between visit+detector and tract+patch. So in this test we
3159 need to translate queries that originally used the exposure dimension
3160 to use the (very similar) visit dimension instead.
3161 """
3162 registry = self.makeRegistry()
3163 self.loadData(registry, "hsc-rc2-subset.yaml")
3164 self.assertEqual(
3165 [
3166 record.id
3167 for record in registry.queryDimensionRecords("visit", instrument="HSC")
3168 .order_by("id")
3169 .limit(5)
3170 ],
3171 [318, 322, 326, 330, 332],
3172 )
3173 self.assertEqual(
3174 [
3175 data_id["visit"]
3176 for data_id in registry.queryDataIds(["visit"], instrument="HSC").order_by("id").limit(5)
3177 ],
3178 [318, 322, 326, 330, 332],
3179 )
3180 self.assertEqual(
3181 [
3182 record.id
3183 for record in registry.queryDimensionRecords("detector", instrument="HSC")
3184 .order_by("full_name")
3185 .limit(5)
3186 ],
3187 [73, 72, 71, 70, 65],
3188 )
3189 self.assertEqual(
3190 [
3191 data_id["detector"]
3192 for data_id in registry.queryDataIds(["detector"], instrument="HSC")
3193 .order_by("full_name")
3194 .limit(5)
3195 ],
3196 [73, 72, 71, 70, 65],
3197 )
3199 def test_long_query_names(self) -> None:
3200 """Test that queries involving very long names are handled correctly.
3202 This is especially important for PostgreSQL, which truncates
3203 identifiers longer than 63 bytes, but it's worth testing for all DBs.
3204 """
3205 registry = self.makeRegistry()
3206 name = "abcd" * 17
3207 registry.registerDatasetType(
3208 DatasetType(
3209 name,
3210 dimensions=(),
3211 storageClass="Exposure",
3212 universe=registry.dimensions,
3213 )
3214 )
3215 # We need to search more than one collection that actually contains a
3216 # matching dataset; otherwise an optimization makes findFirst=True a
3217 # no-op and would hide any bugs caused by truncation.
3218 run1 = "run1"
3219 registry.registerRun(run1)
3220 run2 = "run2"
3221 registry.registerRun(run2)
3222 (ref1,) = registry.insertDatasets(name, [DataCoordinate.makeEmpty(registry.dimensions)], run1)
3223 registry.insertDatasets(name, [DataCoordinate.makeEmpty(registry.dimensions)], run2)
3224 self.assertEqual(
3225 set(registry.queryDatasets(name, collections=[run1, run2], findFirst=True)),
3226 {ref1},
3227 )
3229 def test_skypix_constraint_queries(self) -> None:
3230 """Test queries spatially constrained by a skypix data ID."""
3231 registry = self.makeRegistry()
3232 self.loadData(registry, "hsc-rc2-subset.yaml")
3233 patch_regions = {
3234 (data_id["tract"], data_id["patch"]): data_id.region
3235 for data_id in registry.queryDataIds(["patch"]).expanded()
3236 }
3237 skypix_dimension: SkyPixDimension = registry.dimensions["htm11"]
3238 # This check ensures the test doesn't become trivial due to a config
3239 # change; if it does, just pick a different HTM level.
3240 self.assertNotEqual(skypix_dimension, registry.dimensions.commonSkyPix)
3241 # Gather all skypix IDs that definitely overlap at least one of these
3242 # patches.
3243 relevant_skypix_ids = lsst.sphgeom.RangeSet()
3244 for patch_region in patch_regions.values():
3245 relevant_skypix_ids |= skypix_dimension.pixelization.interior(patch_region)
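# relevant_skypix_ids is a set of integer ranges; iterating it yields
# (begin, end) pairs, which the loop below expands into individual
# pixel indices with range().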
3246 # Look for a "nontrivial" skypix_id that overlaps at least one patch
3247 # and does not overlap at least one other patch.
3248 for skypix_id in itertools.chain.from_iterable(
3249 range(begin, end) for begin, end in relevant_skypix_ids
3250 ):
3251 skypix_region = skypix_dimension.pixelization.pixel(skypix_id)
3252 overlapping_patches = {
3253 patch_key
3254 for patch_key, patch_region in patch_regions.items()
3255 if not patch_region.isDisjointFrom(skypix_region)
3256 }
3257 if overlapping_patches and overlapping_patches != patch_regions.keys():
3258 break
3259 else:
3260 raise RuntimeError("Could not find usable skypix ID for this dimension configuration.")
3261 self.assertEqual(
3262 {
3263 (data_id["tract"], data_id["patch"])
3264 for data_id in registry.queryDataIds(
3265 ["patch"],
3266 dataId={skypix_dimension.name: skypix_id},
3267 )
3268 },
3269 overlapping_patches,
3270 )
3271 # Test that a three-way join that includes the common skypix system in
3272 # the dimensions doesn't generate redundant join terms in the query.
3273 full_data_ids = set(
3274 registry.queryDataIds(
3275 ["tract", "visit", "htm7"], skymap="hsc_rings_v1", instrument="HSC"
3276 ).expanded()
3277 )
3278 self.assertGreater(len(full_data_ids), 0)
3279 for data_id in full_data_ids:
3280 self.assertFalse(data_id.records["tract"].region.isDisjointFrom(data_id.records["htm7"].region))
3281 self.assertFalse(data_id.records["visit"].region.isDisjointFrom(data_id.records["htm7"].region))
3283 def test_spatial_constraint_queries(self) -> None:
3284 """Test queries in which one spatial dimension in the constraint (data
3285 ID or ``where`` string) constrains a different spatial dimension in the
3286 query result columns.
3287 """
3288 registry = self.makeRegistry()
3289 self.loadData(registry, "hsc-rc2-subset.yaml")
3290 patch_regions = {
3291 (data_id["tract"], data_id["patch"]): data_id.region
3292 for data_id in registry.queryDataIds(["patch"]).expanded()
3293 }
3294 observation_regions = {
3295 (data_id["visit"], data_id["detector"]): data_id.region
3296 for data_id in registry.queryDataIds(["visit", "detector"]).expanded()
3297 }
3298 all_combos = {
3299 (patch_key, observation_key)
3300 for patch_key, observation_key in itertools.product(patch_regions, observation_regions)
3301 }
3302 overlapping_combos = {
3303 (patch_key, observation_key)
3304 for patch_key, observation_key in all_combos
3305 if not patch_regions[patch_key].isDisjointFrom(observation_regions[observation_key])
3306 }
3307 # Check a direct spatial join with no constraint first.
3308 self.assertEqual(
3309 {
3310 ((data_id["tract"], data_id["patch"]), (data_id["visit"], data_id["detector"]))
3311 for data_id in registry.queryDataIds(["patch", "visit", "detector"])
3312 },
3313 overlapping_combos,
3314 )
3315 overlaps_by_patch: defaultdict[tuple[int, int], set[tuple[int, int]]] = defaultdict(set)
3316 overlaps_by_observation: defaultdict[tuple[int, int], set[tuple[int, int]]] = defaultdict(set)
3317 for patch_key, observation_key in overlapping_combos:
3318 overlaps_by_patch[patch_key].add(observation_key)
3319 overlaps_by_observation[observation_key].add(patch_key)
3320 # Find a patch and an observation that each overlap at least one, but
3321 # not all, of the other kind.
3322 nontrivial_patch = next(
3323 iter(
3324 patch_key
3325 for patch_key, observation_keys in overlaps_by_patch.items()
3326 if observation_keys and observation_keys != observation_regions.keys()
3327 )
3328 )
3329 nontrivial_observation = next(
3330 iter(
3331 observation_key
3332 for observation_key, patch_keys in overlaps_by_observation.items()
3333 if patch_keys and patch_keys != patch_regions.keys()
3334 )
3335 )
3336 # Use the nontrivial patches and observations as constraints on the
3337 # other dimensions in various ways, first via a 'where' expression.
3338 # It's better in general to use 'bind' instead of f-strings, but these
3339 # are all integers so there are no quoting concerns.
3340 self.assertEqual(
3341 {
3342 (data_id["visit"], data_id["detector"])
3343 for data_id in registry.queryDataIds(
3344 ["visit", "detector"],
3345 where=f"tract={nontrivial_patch[0]} AND patch={nontrivial_patch[1]}",
3346 skymap="hsc_rings_v1",
3347 )
3348 },
3349 overlaps_by_patch[nontrivial_patch],
3350 )
3351 self.assertEqual(
3352 {
3353 (data_id["tract"], data_id["patch"])
3354 for data_id in registry.queryDataIds(
3355 ["patch"],
3356 where=f"visit={nontrivial_observation[0]} AND detector={nontrivial_observation[1]}",
3357 instrument="HSC",
3358 )
3359 },
3360 overlaps_by_observation[nontrivial_observation],
3361 )
3362 # and then via the dataId argument.
3363 self.assertEqual(
3364 {
3365 (data_id["visit"], data_id["detector"])
3366 for data_id in registry.queryDataIds(
3367 ["visit", "detector"],
3368 dataId={
3369 "tract": nontrivial_patch[0],
3370 "patch": nontrivial_patch[1],
3371 },
3372 skymap="hsc_rings_v1",
3373 )
3374 },
3375 overlaps_by_patch[nontrivial_patch],
3376 )
3377 self.assertEqual(
3378 {
3379 (data_id["tract"], data_id["patch"])
3380 for data_id in registry.queryDataIds(
3381 ["patch"],
3382 dataId={
3383 "visit": nontrivial_observation[0],
3384 "detector": nontrivial_observation[1],
3385 },
3386 instrument="HSC",
3387 )
3388 },
3389 overlaps_by_observation[nontrivial_observation],
3390 )
3392 def test_query_projection_drop_postprocessing(self) -> None:
3393 """Test that projections and deduplications on query objects can
3394 drop post-query region filtering to ensure the query remains in
3395 the SQL engine.
3396 """
3397 registry = self.makeRegistry()
3398 self.loadData(registry, "base.yaml")
3399 self.loadData(registry, "spatial.yaml")
3401 def pop_transfer(tree: Relation) -> Relation:
3402 """If a relation tree terminates with a transfer to a new engine,
3403 return the relation prior to that transfer. If not, return the
3404 original relation.
3405 """
3406 match tree:
3407 case Transfer(target=target):
3408 return target
3409 case _:
3410 return tree
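# pop_transfer lets the assertions below inspect the engine that did
# the real work, ignoring a trailing Transfer used only to fetch rows.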
3412 # There's no public way to get a Query object yet, so we get one from a
3413 # DataCoordinateQueryResults private attribute. When a public API is
3414 # available this test should use it.
3415 query = registry.queryDataIds(["visit", "detector", "tract", "patch"])._query
3416 # We expect this query to terminate in the iteration engine originally,
3417 # because region-filtering is necessary.
3418 self.assertIsInstance(pop_transfer(query.relation).engine, iteration.Engine)
3419 # If we deduplicate, we usually have to do that downstream of the
3420 # filtering. That means the deduplication has to happen in the
3421 # iteration engine.
3422 self.assertIsInstance(pop_transfer(query.projected(unique=True).relation).engine, iteration.Engine)
3423 # If we pass drop_postprocessing, we instead drop the region filtering
3424 # so the deduplication can happen in SQL (though there might still be
3425 # a transfer to iteration at the tail of the tree that we can ignore;
3426 # that's what the pop_transfer takes care of here).
3427 self.assertIsInstance(
3428 pop_transfer(query.projected(unique=True, drop_postprocessing=True).relation).engine,
3429 sql.Engine,
3430 )
3432 def test_query_find_datasets_drop_postprocessing(self) -> None:
3433 """Test that DataCoordinateQueryResults.findDatasets avoids commutator
3434 problems with the FindFirstDataset relation operation.
3435 """
3436 # Setup: load some visit, tract, and patch records, and insert two
3437 # datasets with dimensions {visit, patch}, with one in each of two
3438 # RUN collections.
3439 registry = self.makeRegistry()
3440 self.loadData(registry, "base.yaml")
3441 self.loadData(registry, "spatial.yaml")
3442 storage_class = StorageClass("Warpy")
3443 registry.storageClasses.registerStorageClass(storage_class)
3444 dataset_type = DatasetType(
3445 "warp", {"visit", "patch"}, storageClass=storage_class, universe=registry.dimensions
3446 )
3447 registry.registerDatasetType(dataset_type)
3448 (data_id,) = registry.queryDataIds(["visit", "patch"]).limit(1)
3449 registry.registerRun("run1")
3450 registry.registerRun("run2")
3451 (ref1,) = registry.insertDatasets(dataset_type, [data_id], run="run1")
3452 (ref2,) = registry.insertDatasets(dataset_type, [data_id], run="run2")
3453 # Query for the dataset using queryDataIds(...).findDatasets(...)
3454 # against only one of the two collections. This should work even
3455 # though the relation returned by queryDataIds ends with
3456 # iteration-engine region-filtering, because we can recognize before
3457 # running the query that there is only one collection to search and
3458 # hence the (default) findFirst=True is irrelevant, and joining in the
3459 # dataset query commutes past the iteration-engine postprocessing.
3460 query1 = registry.queryDataIds(
3461 {"visit", "patch"}, visit=data_id["visit"], instrument=data_id["instrument"]
3462 )
3463 self.assertEqual(
3464 set(query1.findDatasets(dataset_type.name, collections=["run1"])),
3465 {ref1},
3466 )
3467 # Query for the dataset using queryDataIds(...).findDatasets(...)
3468 # against both collections. This can only work if the FindFirstDataset
3469 # operation can be commuted past the iteration-engine operations into SQL.
3470 query2 = registry.queryDataIds(
3471 {"visit", "patch"}, visit=data_id["visit"], instrument=data_id["instrument"]
3472 )
3473 self.assertEqual(
3474 set(query2.findDatasets(dataset_type.name, collections=["run2", "run1"])),
3475 {ref2},
3476 )
3478 def test_query_empty_collections(self) -> None:
3479 """Test for registry query methods with empty collections. The methods
3480 should return empty result set (or None when applicable) and provide
3481 "doomed" diagnostics.
3482 """
3483 registry = self.makeRegistry()
3484 self.loadData(registry, "base.yaml")
3485 self.loadData(registry, "datasets.yaml")
3487 # Tests for registry.findDataset()
3488 with self.assertRaises(NoDefaultCollectionError):
3489 registry.findDataset("bias", instrument="Cam1", detector=1)
3490 self.assertIsNotNone(registry.findDataset("bias", instrument="Cam1", detector=1, collections=...))
3491 self.assertIsNone(registry.findDataset("bias", instrument="Cam1", detector=1, collections=[]))
3493 # Tests for registry.queryDatasets()
3494 with self.assertRaises(NoDefaultCollectionError):
3495 registry.queryDatasets("bias")
3496 self.assertTrue(list(registry.queryDatasets("bias", collections=...)))
3498 result = registry.queryDatasets("bias", collections=[])
3499 self.assertEqual(len(list(result)), 0)
3500 messages = list(result.explain_no_results())
3501 self.assertTrue(messages)
3502 self.assertTrue(any("because collection list is empty" in message for message in messages))
3504 # Tests for registry.queryDataIds()
3505 with self.assertRaises(NoDefaultCollectionError):
3506 registry.queryDataIds("detector", datasets="bias")
3507 self.assertTrue(list(registry.queryDataIds("detector", datasets="bias", collections=...)))
3509 result = registry.queryDataIds("detector", datasets="bias", collections=[])
3510 self.assertEqual(len(list(result)), 0)
3511 messages = list(result.explain_no_results())
3512 self.assertTrue(messages)
3513 self.assertTrue(any("because collection list is empty" in message for message in messages))
3515 # Tests for registry.queryDimensionRecords()
3516 with self.assertRaises(NoDefaultCollectionError):
3517 registry.queryDimensionRecords("detector", datasets="bias")
3518 self.assertTrue(list(registry.queryDimensionRecords("detector", datasets="bias", collections=...)))
3520 result = registry.queryDimensionRecords("detector", datasets="bias", collections=[])
3521 self.assertEqual(len(list(result)), 0)
3522 messages = list(result.explain_no_results())
3523 self.assertTrue(messages)
3524 self.assertTrue(any("because collection list is empty" in message for message in messages))