Coverage for python/lsst/daf/butler/registry/tests/_registry.py: 5%
1479 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RegistryTests"]

import itertools
import logging
import os
import re
import unittest
import uuid
from abc import ABC, abstractmethod
from collections import defaultdict, namedtuple
from collections.abc import Iterator
from datetime import datetime, timedelta
from typing import TYPE_CHECKING

import astropy.time
import sqlalchemy

try:
    import numpy as np
except ImportError:
    np = None

import lsst.sphgeom
from lsst.daf.relation import Relation, RelationalAlgebraError, Transfer, iteration, sql

from ...core import (
    DataCoordinate,
    DataCoordinateSet,
    DatasetAssociation,
    DatasetIdFactory,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetType,
    DimensionGraph,
    NamedValueSet,
    SkyPixDimension,
    StorageClass,
    Timespan,
    ddl,
)
from .._collection_summary import CollectionSummary
from .._collectionType import CollectionType
from .._config import RegistryConfig
from .._exceptions import (
    ArgumentError,
    CollectionError,
    CollectionTypeError,
    ConflictingDefinitionError,
    DataIdValueError,
    DatasetTypeError,
    InconsistentDataIdError,
    MissingCollectionError,
    MissingDatasetTypeError,
    NoDefaultCollectionError,
    OrphanedRecordError,
)
from ..interfaces import ButlerAttributeExistsError

if TYPE_CHECKING:
    from .._registry import Registry


class RegistryTests(ABC):
    """Generic tests for the `Registry` class that can be subclassed to
    generate tests for different configurations.
    """

    collectionsManager: str | None = None
    """Name of the collections manager class. If a subclass provides a value
    for this member, it overrides the name specified in the default
    configuration (`str`).
    """

    datasetsManager: str | dict[str, str] | None = None
    """Name or configuration dictionary of the datasets manager class. If a
    subclass provides a value for this member, it overrides the name
    specified in the default configuration (`str` or `dict`).
    """

    @classmethod
    @abstractmethod
    def getDataDir(cls) -> str:
        """Return the root directory containing test data YAML files."""
        raise NotImplementedError()

    def makeRegistryConfig(self) -> RegistryConfig:
        """Create a `RegistryConfig` used to create a registry.

        This method should be called by a subclass from `makeRegistry`.
        The returned instance will be pre-configured based on the values of
        class members, and default-configured for all other parameters.
        Subclasses that need an entirely default configuration should just
        instantiate `RegistryConfig` directly.
        """
        config = RegistryConfig()
        if self.collectionsManager:
            config["managers", "collections"] = self.collectionsManager
        if self.datasetsManager:
            config["managers", "datasets"] = self.datasetsManager
        return config
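
    # Illustrative sketch (not part of the original test suite): a subclass
    # selects a non-default manager by overriding the class members above,
    # which `makeRegistryConfig` then copies into the configuration.  The
    # manager path below is an assumption shown for illustration only.
    #
    #     class SynthIntKeyRegistryTests(RegistryTests):
    #         collectionsManager = (
    #             "lsst.daf.butler.registry.collections.synthIntKey."
    #             "SynthIntKeyCollectionManager"
    #         )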

    @abstractmethod
    def makeRegistry(self, share_repo_with: Registry | None = None) -> Registry | None:
        """Return the Registry instance to be tested.

        Parameters
        ----------
        share_repo_with : `Registry`, optional
            If provided, the new registry should point to the same data
            repository as this existing registry.

        Returns
        -------
        registry : `Registry`
            New `Registry` instance, or `None` *only* if `share_repo_with` is
            not `None` and this test case does not support that argument
            (e.g. it is impossible with in-memory SQLite DBs).
        """
        raise NotImplementedError()
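
    # Illustrative sketch (not part of the original test suite): a concrete
    # test case mixes this ABC into `unittest.TestCase` and implements the
    # two abstract methods.  The registry construction below is an assumption
    # for illustration; the real subclasses live with the concrete `Registry`
    # implementations.
    #
    #     class SqliteRegistryTests(RegistryTests, unittest.TestCase):
    #         @classmethod
    #         def getDataDir(cls) -> str:
    #             return os.path.join(os.path.dirname(__file__), "data", "registry")
    #
    #         def makeRegistry(self, share_repo_with=None):
    #             if share_repo_with is not None:
    #                 return None  # an in-memory SQLite repo cannot be shared
    #             config = self.makeRegistryConfig()
    #             config["db"] = "sqlite://"  # in-memory database
    #             return Registry.createFromConfig(config)  # assumed factory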

    def loadData(self, registry: Registry, filename: str):
        """Load registry test data from ``getDataDir/<filename>``,
        which should be a YAML import/export file.
        """
        from ...transfers import YamlRepoImportBackend

        with open(os.path.join(self.getDataDir(), filename)) as stream:
            backend = YamlRepoImportBackend(stream, registry)
        backend.register()
        backend.load(datastore=None)

    def checkQueryResults(self, results, expected):
        """Check that a query results object contains expected values.

        Parameters
        ----------
        results : `DataCoordinateQueryResults` or `DatasetQueryResults`
            A lazy-evaluation query results object.
        expected : `list`
            A list of `DataCoordinate` or `DatasetRef` objects that should be
            equal to the results of the query, aside from ordering.
        """
        self.assertCountEqual(list(results), expected)
        self.assertEqual(results.count(), len(expected))
        if expected:
            self.assertTrue(results.any())
        else:
            self.assertFalse(results.any())
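
    # Usage sketch (assumed names, for illustration only): the helper lets a
    # test verify a lazy query result's rows, `count()`, and `any()` in one
    # call, e.g.
    #
    #     results = registry.queryDataIds(["detector"], instrument="Cam1")
    #     self.checkQueryResults(results, expected=expected_data_ids)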

    def testOpaque(self):
        """Tests for `Registry.registerOpaqueTable`,
        `Registry.insertOpaqueData`, `Registry.fetchOpaqueData`, and
        `Registry.deleteOpaqueData`.
        """
        registry = self.makeRegistry()
        table = "opaque_table_for_testing"
        registry.registerOpaqueTable(
            table,
            spec=ddl.TableSpec(
                fields=[
                    ddl.FieldSpec("id", dtype=sqlalchemy.BigInteger, primaryKey=True),
                    ddl.FieldSpec("name", dtype=sqlalchemy.String, length=16, nullable=False),
                    ddl.FieldSpec("count", dtype=sqlalchemy.SmallInteger, nullable=True),
                ],
            ),
        )
        rows = [
            {"id": 1, "name": "one", "count": None},
            {"id": 2, "name": "two", "count": 5},
            {"id": 3, "name": "three", "count": 6},
        ]
        registry.insertOpaqueData(table, *rows)
        self.assertCountEqual(rows, list(registry.fetchOpaqueData(table)))
        self.assertEqual(rows[0:1], list(registry.fetchOpaqueData(table, id=1)))
        self.assertEqual(rows[1:2], list(registry.fetchOpaqueData(table, name="two")))
        self.assertEqual(rows[0:1], list(registry.fetchOpaqueData(table, id=(1, 3), name=("one", "two"))))
        self.assertEqual(rows, list(registry.fetchOpaqueData(table, id=(1, 2, 3))))
        # Test a very long IN clause that exceeds the SQLite limit on the
        # number of parameters.  SQLite documents the limit as 32k, but in
        # practice it appears to be much higher.
        self.assertEqual(rows, list(registry.fetchOpaqueData(table, id=list(range(300_000)))))
        # Two IN clauses, each longer than the 1k batch size: the first has
        # duplicates, and the second has matching elements in different
        # batches (after sorting).
        self.assertEqual(
            rows[0:2],
            list(
                registry.fetchOpaqueData(
                    table,
                    id=list(range(1000)) + list(range(100, 0, -1)),
                    name=["one"] + [f"q{i}" for i in range(2200)] + ["two"],
                )
            ),
        )
        self.assertEqual([], list(registry.fetchOpaqueData(table, id=1, name="two")))
        registry.deleteOpaqueData(table, id=3)
        self.assertCountEqual(rows[:2], list(registry.fetchOpaqueData(table)))
        registry.deleteOpaqueData(table)
        self.assertEqual([], list(registry.fetchOpaqueData(table)))

    def testDatasetType(self):
        """Tests for `Registry.registerDatasetType` and
        `Registry.getDatasetType`.
        """
        registry = self.makeRegistry()
        # Check valid insert
        datasetTypeName = "test"
        storageClass = StorageClass("testDatasetType")
        registry.storageClasses.registerStorageClass(storageClass)
        dimensions = registry.dimensions.extract(("instrument", "visit"))
        differentDimensions = registry.dimensions.extract(("instrument", "patch"))
        inDatasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        # Inserting for the first time should return True
        self.assertTrue(registry.registerDatasetType(inDatasetType))
        outDatasetType1 = registry.getDatasetType(datasetTypeName)
        self.assertEqual(outDatasetType1, inDatasetType)

        # Re-inserting should work
        self.assertFalse(registry.registerDatasetType(inDatasetType))
        # Except when they are not identical
        with self.assertRaises(ConflictingDefinitionError):
            nonIdenticalDatasetType = DatasetType(datasetTypeName, differentDimensions, storageClass)
            registry.registerDatasetType(nonIdenticalDatasetType)

        # Template can be None
        datasetTypeName = "testNoneTemplate"
        storageClass = StorageClass("testDatasetType2")
        registry.storageClasses.registerStorageClass(storageClass)
        dimensions = registry.dimensions.extract(("instrument", "visit"))
        inDatasetType = DatasetType(datasetTypeName, dimensions, storageClass)
        registry.registerDatasetType(inDatasetType)
        outDatasetType2 = registry.getDatasetType(datasetTypeName)
        self.assertEqual(outDatasetType2, inDatasetType)

        allTypes = set(registry.queryDatasetTypes())
        self.assertEqual(allTypes, {outDatasetType1, outDatasetType2})

    def testDimensions(self):
        """Tests for `Registry.insertDimensionData`,
        `Registry.syncDimensionData`, and `Registry.expandDataId`.
        """
        registry = self.makeRegistry()
        dimensionName = "instrument"
        dimension = registry.dimensions[dimensionName]
        dimensionValue = {
            "name": "DummyCam",
            "visit_max": 10,
            "visit_system": 0,
            "exposure_max": 10,
            "detector_max": 2,
            "class_name": "lsst.pipe.base.Instrument",
        }
        registry.insertDimensionData(dimensionName, dimensionValue)
        # Inserting the same value twice should fail
        with self.assertRaises(sqlalchemy.exc.IntegrityError):
            registry.insertDimensionData(dimensionName, dimensionValue)
        # expandDataId should retrieve the record we just inserted
        self.assertEqual(
            registry.expandDataId(instrument="DummyCam", graph=dimension.graph)
            .records[dimensionName]
            .toDict(),
            dimensionValue,
        )
        # expandDataId should raise if there is no record with the given ID.
        with self.assertRaises(DataIdValueError):
            registry.expandDataId({"instrument": "Unknown"}, graph=dimension.graph)
        # band doesn't have a table; insert should fail.
        with self.assertRaises(TypeError):
            registry.insertDimensionData("band", {"band": "i"})
        dimensionName2 = "physical_filter"
        dimension2 = registry.dimensions[dimensionName2]
        dimensionValue2 = {"name": "DummyCam_i", "band": "i"}
        # Missing required dependency ("instrument") should fail
        with self.assertRaises(KeyError):
            registry.insertDimensionData(dimensionName2, dimensionValue2)
        # Adding required dependency should fix the failure
        dimensionValue2["instrument"] = "DummyCam"
        registry.insertDimensionData(dimensionName2, dimensionValue2)
        # expandDataId should retrieve the record we just inserted.
        self.assertEqual(
            registry.expandDataId(instrument="DummyCam", physical_filter="DummyCam_i", graph=dimension2.graph)
            .records[dimensionName2]
            .toDict(),
            dimensionValue2,
        )
        # Use syncDimensionData to insert a new record successfully.
        dimensionName3 = "detector"
        dimensionValue3 = {
            "instrument": "DummyCam",
            "id": 1,
            "full_name": "one",
            "name_in_raft": "zero",
            "purpose": "SCIENCE",
        }
        self.assertTrue(registry.syncDimensionData(dimensionName3, dimensionValue3))
        # Sync that again.  Note that one field ("raft") is NULL, and that
        # should be okay.
        self.assertFalse(registry.syncDimensionData(dimensionName3, dimensionValue3))
        # Now try that sync with the same primary key but a different value.
        # This should fail.
        with self.assertRaises(ConflictingDefinitionError):
            registry.syncDimensionData(
                dimensionName3,
                {
                    "instrument": "DummyCam",
                    "id": 1,
                    "full_name": "one",
                    "name_in_raft": "four",
                    "purpose": "SCIENCE",
                },
            )

    @unittest.skipIf(np is None, "numpy not available.")
    def testNumpyDataId(self):
        """Test that we can use a numpy int in a dataId."""
        registry = self.makeRegistry()
        dimensionEntries = [
            ("instrument", {"instrument": "DummyCam"}),
            ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
            # Using an np.int64 here fails unless Records.fromDict is also
            # patched to look for numbers.Integral
            ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
        ]
        for args in dimensionEntries:
            registry.insertDimensionData(*args)

        # Try a normal integer and something that looks like an int but
        # is not.
        for visit_id in (42, np.int64(42)):
            with self.subTest(visit_id=visit_id, id_type=type(visit_id).__name__):
                expanded = registry.expandDataId({"instrument": "DummyCam", "visit": visit_id})
                self.assertEqual(expanded["visit"], int(visit_id))
                self.assertIsInstance(expanded["visit"], int)

    def testDataIdRelationships(self):
        """Test that `Registry.expandDataId` raises an exception when the
        given keys are inconsistent.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        # Insert a few more dimension records for the next test.
        registry.insertDimensionData(
            "exposure",
            {"instrument": "Cam1", "id": 1, "obs_id": "one", "physical_filter": "Cam1-G"},
        )
        registry.insertDimensionData(
            "exposure",
            {"instrument": "Cam1", "id": 2, "obs_id": "two", "physical_filter": "Cam1-G"},
        )
        registry.insertDimensionData(
            "visit_system",
            {"instrument": "Cam1", "id": 0, "name": "one-to-one"},
        )
        registry.insertDimensionData(
            "visit",
            {"instrument": "Cam1", "id": 1, "name": "one", "physical_filter": "Cam1-G", "visit_system": 0},
        )
        registry.insertDimensionData(
            "visit_definition",
            {"instrument": "Cam1", "visit": 1, "exposure": 1, "visit_system": 0},
        )
        with self.assertRaises(InconsistentDataIdError):
            registry.expandDataId(
                {"instrument": "Cam1", "visit": 1, "exposure": 2},
            )

    def testDataset(self):
        """Basic tests for `Registry.insertDatasets`, `Registry.getDataset`,
        and `Registry.removeDatasets`.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        run = "tésτ"
        registry.registerRun(run)
        datasetType = registry.getDatasetType("bias")
        dataId = {"instrument": "Cam1", "detector": 2}
        (ref,) = registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
        outRef = registry.getDataset(ref.id)
        self.assertIsNotNone(ref.id)
        self.assertEqual(ref, outRef)
        with self.assertRaises(ConflictingDefinitionError):
            registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
        registry.removeDatasets([ref])
        self.assertIsNone(registry.findDataset(datasetType, dataId, collections=[run]))

    def testFindDataset(self):
        """Tests for `Registry.findDataset`."""
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        run = "tésτ"
        datasetType = registry.getDatasetType("bias")
        dataId = {"instrument": "Cam1", "detector": 4}
        registry.registerRun(run)
        (inputRef,) = registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
        outputRef = registry.findDataset(datasetType, dataId, collections=[run])
        self.assertEqual(outputRef, inputRef)
        # Check that retrieval with an invalid dataId raises
        with self.assertRaises(LookupError):
            dataId = {"instrument": "Cam1"}  # no detector
            registry.findDataset(datasetType, dataId, collections=run)
        # Check that different dataIds match to different datasets
        dataId1 = {"instrument": "Cam1", "detector": 1}
        (inputRef1,) = registry.insertDatasets(datasetType, dataIds=[dataId1], run=run)
        dataId2 = {"instrument": "Cam1", "detector": 2}
        (inputRef2,) = registry.insertDatasets(datasetType, dataIds=[dataId2], run=run)
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=run), inputRef1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=run), inputRef2)
        self.assertNotEqual(registry.findDataset(datasetType, dataId1, collections=run), inputRef2)
        self.assertNotEqual(registry.findDataset(datasetType, dataId2, collections=run), inputRef1)
        # Check that requesting a non-existing dataId returns None
        nonExistingDataId = {"instrument": "Cam1", "detector": 3}
        self.assertIsNone(registry.findDataset(datasetType, nonExistingDataId, collections=run))
        # Search more than one collection, where two have the right dataset
        # type and another does not.
        registry.registerRun("empty")
        self.loadData(registry, "datasets-uuid.yaml")
        bias1 = registry.findDataset("bias", instrument="Cam1", detector=2, collections=["imported_g"])
        self.assertIsNotNone(bias1)
        bias2 = registry.findDataset("bias", instrument="Cam1", detector=2, collections=["imported_r"])
        self.assertIsNotNone(bias2)
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "imported_g", "imported_r"]
            ),
        )
        self.assertEqual(
            bias2,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "imported_r", "imported_g"]
            ),
        )
        # Search more than one collection, with one of them a CALIBRATION
        # collection.
        registry.registerCollection("Cam1/calib", CollectionType.CALIBRATION)
        timespan = Timespan(
            begin=astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai"),
            end=astropy.time.Time("2020-01-01T02:00:00", format="isot", scale="tai"),
        )
        registry.certify("Cam1/calib", [bias2], timespan=timespan)
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias",
                instrument="Cam1",
                detector=2,
                collections=["empty", "imported_g", "Cam1/calib"],
                timespan=timespan,
            ),
        )
        self.assertEqual(
            bias2,
            registry.findDataset(
                "bias",
                instrument="Cam1",
                detector=2,
                collections=["empty", "Cam1/calib", "imported_g"],
                timespan=timespan,
            ),
        )
        # If we try to search those same collections without a timespan, it
        # should still work, since the CALIBRATION collection is ignored.
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "imported_g", "Cam1/calib"]
            ),
        )
        self.assertEqual(
            bias1,
            registry.findDataset(
                "bias", instrument="Cam1", detector=2, collections=["empty", "Cam1/calib", "imported_g"]
            ),
        )

    def testRemoveDatasetTypeSuccess(self):
        """Test that Registry.removeDatasetType works when there are no
        datasets of that type present.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        registry.removeDatasetType("flat")
        with self.assertRaises(MissingDatasetTypeError):
            registry.getDatasetType("flat")

    def testRemoveDatasetTypeFailure(self):
        """Test that Registry.removeDatasetType raises when there are datasets
        of that type present or if the dataset type is for a component.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        with self.assertRaises(OrphanedRecordError):
            registry.removeDatasetType("flat")
        with self.assertRaises(ValueError):
            registry.removeDatasetType(DatasetType.nameWithComponent("flat", "image"))

    def testImportDatasetsUUID(self):
        """Test for `Registry._importDatasets` with UUID dataset ID."""
        if isinstance(self.datasetsManager, str):
            if not self.datasetsManager.endswith(".ByDimensionsDatasetRecordStorageManagerUUID"):
                self.skipTest(f"Unexpected dataset manager {self.datasetsManager}")
        elif isinstance(self.datasetsManager, dict):
            if not self.datasetsManager["cls"].endswith(".ByDimensionsDatasetRecordStorageManagerUUID"):
                self.skipTest(f"Unexpected dataset manager {self.datasetsManager['cls']}")

        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        for run in range(6):
            registry.registerRun(f"run{run}")
        datasetTypeBias = registry.getDatasetType("bias")
        datasetTypeFlat = registry.getDatasetType("flat")
        dataIdBias1 = {"instrument": "Cam1", "detector": 1}
        dataIdBias2 = {"instrument": "Cam1", "detector": 2}
        dataIdFlat1 = {"instrument": "Cam1", "detector": 1, "physical_filter": "Cam1-G", "band": "g"}

        ref = DatasetRef(datasetTypeBias, dataIdBias1, run="run0")
        (ref1,) = registry._importDatasets([ref])
        # UUID is used without change
        self.assertEqual(ref.id, ref1.id)

        # All different failure modes
        refs = (
            # Importing the same DatasetRef with a different dataset ID is an
            # error
            DatasetRef(datasetTypeBias, dataIdBias1, run="run0"),
            # Same DatasetId but different DataId
            DatasetRef(datasetTypeBias, dataIdBias2, id=ref1.id, run="run0"),
            DatasetRef(datasetTypeFlat, dataIdFlat1, id=ref1.id, run="run0"),
            # Same DatasetRef and DatasetId but different run
            DatasetRef(datasetTypeBias, dataIdBias1, id=ref1.id, run="run1"),
        )
        for ref in refs:
            with self.assertRaises(ConflictingDefinitionError):
                registry._importDatasets([ref])

        # Test reproducible (non-random) IDs; they can be re-imported
        # multiple times.
        for run, idGenMode in ((2, DatasetIdGenEnum.DATAID_TYPE), (4, DatasetIdGenEnum.DATAID_TYPE_RUN)):
            with self.subTest(idGenMode=idGenMode):
                # Make a dataset ref with a reproducible dataset ID.
                ref = DatasetRef(datasetTypeBias, dataIdBias1, run=f"run{run}", id_generation_mode=idGenMode)
                (ref1,) = registry._importDatasets([ref])
                self.assertIsInstance(ref1.id, uuid.UUID)
                self.assertEqual(ref1.id.version, 5)
                self.assertEqual(ref1.id, ref.id)

                # Importing it again is OK
                (ref2,) = registry._importDatasets([ref1])
                self.assertEqual(ref2.id, ref1.id)

                # Cannot import into a different run with the same ID
                ref = DatasetRef(datasetTypeBias, dataIdBias1, id=ref1.id, run=f"run{run+1}")
                with self.assertRaises(ConflictingDefinitionError):
                    registry._importDatasets([ref])

                ref = DatasetRef(
                    datasetTypeBias, dataIdBias1, run=f"run{run+1}", id_generation_mode=idGenMode
                )
                if idGenMode is DatasetIdGenEnum.DATAID_TYPE:
                    # Cannot import the same DATAID_TYPE ref into a new run
                    with self.assertRaises(ConflictingDefinitionError):
                        (ref2,) = registry._importDatasets([ref])
                else:
                    # A DATAID_TYPE_RUN ref can be imported into a new run
                    (ref2,) = registry._importDatasets([ref])
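
    # Sketch of the idea behind the two reproducible modes (the exact hash
    # inputs are an assumption here; see `DatasetIdFactory` for the real
    # implementation).  Both produce version-5 UUIDs, i.e. deterministic:
    #
    #     DATAID_TYPE      ~ uuid5(namespace, dataset_type + data_id)
    #     DATAID_TYPE_RUN  ~ uuid5(namespace, dataset_type + data_id + run)
    #
    # so a DATAID_TYPE ref collides when re-imported into another run, while
    # a DATAID_TYPE_RUN ref gets a new ID per run, as the test above checks.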

    def testDatasetTypeComponentQueries(self):
        """Test component options when querying for dataset types.

        All of the behavior here is deprecated, so many of these tests are
        currently wrapped in a context to check that we get a warning whenever
        a component dataset is actually returned.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        # Test querying for dataset types with different inputs.
        # First query for all dataset types; components should only be
        # included when components=True.
        self.assertEqual({"bias", "flat"}, NamedValueSet(registry.queryDatasetTypes()).names)
        self.assertEqual({"bias", "flat"}, NamedValueSet(registry.queryDatasetTypes(components=False)).names)
        with self.assertWarns(FutureWarning):
            self.assertLess(
                {"bias", "flat", "bias.wcs", "flat.photoCalib"},
                NamedValueSet(registry.queryDatasetTypes(components=True)).names,
            )
        # Use a pattern that can match either parent or components.  Again,
        # components are only returned if components=True.
        self.assertEqual({"bias"}, NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"))).names)
        self.assertEqual(
            {"bias"}, NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"), components=False)).names
        )
        with self.assertWarns(FutureWarning):
            self.assertLess(
                {"bias", "bias.wcs"},
                NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"), components=True)).names,
            )
        # This pattern matches only a component.  In this case we also return
        # that component dataset type if components=None.
        with self.assertWarns(FutureWarning):
            self.assertEqual(
                {"bias.wcs"}, NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"))).names
            )
        self.assertEqual(
            set(),
            NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"), components=False)).names,
        )
        with self.assertWarns(FutureWarning):
            self.assertEqual(
                {"bias.wcs"},
                NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"), components=True)).names,
            )
        # Add a dataset type using a StorageClass that we'll then remove;
        # check that this does not affect our ability to query for dataset
        # types (though it will warn).
        tempStorageClass = StorageClass(
            name="TempStorageClass",
            components={
                "data1": registry.storageClasses.getStorageClass("StructuredDataDict"),
                "data2": registry.storageClasses.getStorageClass("StructuredDataDict"),
            },
        )
        registry.storageClasses.registerStorageClass(tempStorageClass)
        datasetType = DatasetType(
            "temporary",
            dimensions=["instrument"],
            storageClass=tempStorageClass,
            universe=registry.dimensions,
        )
        registry.registerDatasetType(datasetType)
        registry.storageClasses._unregisterStorageClass(tempStorageClass.name)
        datasetType._storageClass = None
        del tempStorageClass
        # Querying for all dataset types, including components, should include
        # at least all non-component dataset types (and I don't want to
        # enumerate all of the Exposure components for bias and flat here).
        with self.assertWarns(FutureWarning):
            with self.assertLogs("lsst.daf.butler.registry", logging.WARN) as cm:
                everything = NamedValueSet(registry.queryDatasetTypes(components=True))
        self.assertIn("TempStorageClass", cm.output[0])
        self.assertLess({"bias", "flat", "temporary"}, everything.names)
        # It should not include "temporary.data1", because we tried to remove
        # the storage class that would tell it about that.  So if the next
        # line fails (i.e. "temporary.data1" _is_ in everything.names), it
        # means this part of the test isn't doing anything, because the
        # _unregister call above isn't simulating the real-life case we want
        # it to simulate, in which different versions of daf_butler in
        # entirely different Python processes interact with the same repo.
        self.assertNotIn("temporary.data1", everything.names)
        # Query for dataset types that start with "temp".  This should again
        # not include the component, and also not fail.
        with self.assertLogs("lsst.daf.butler.registry", logging.WARN) as cm:
            startsWithTemp = NamedValueSet(registry.queryDatasetTypes(re.compile("temp.*"), components=True))
        self.assertIn("TempStorageClass", cm.output[0])
        self.assertEqual({"temporary"}, startsWithTemp.names)
        # Querying with no components should not warn at all.
        with self.assertLogs("lsst.daf.butler.registries", logging.WARN) as cm:
            startsWithTemp = NamedValueSet(registry.queryDatasetTypes(re.compile("temp.*"), components=False))
            # Must issue a warning of our own to be captured.
            logging.getLogger("lsst.daf.butler.registries").warning("test message")
        self.assertEqual(len(cm.output), 1)
        self.assertIn("test message", cm.output[0])

    def testComponentLookups(self):
        """Test searching for component datasets via their parents.

        All of the behavior here is deprecated, so many of these tests are
        currently wrapped in a context to check that we get a warning whenever
        a component dataset is actually returned.
        """
        registry = self.makeRegistry()
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        # Test getting the child dataset type (which does still exist in the
        # Registry), and check for consistency with
        # DatasetRef.makeComponentRef.
        collection = "imported_g"
        parentType = registry.getDatasetType("bias")
        childType = registry.getDatasetType("bias.wcs")
        parentRefResolved = registry.findDataset(
            parentType, collections=collection, instrument="Cam1", detector=1
        )
        self.assertIsInstance(parentRefResolved, DatasetRef)
        self.assertEqual(childType, parentRefResolved.makeComponentRef("wcs").datasetType)
        # Search for a single dataset with findDataset.
        childRef1 = registry.findDataset("bias.wcs", collections=collection, dataId=parentRefResolved.dataId)
        self.assertEqual(childRef1, parentRefResolved.makeComponentRef("wcs"))
        # Search for detector data IDs constrained by component dataset
        # existence with queryDataIds.
        with self.assertWarns(FutureWarning):
            dataIds = registry.queryDataIds(
                ["detector"],
                datasets=["bias.wcs"],
                collections=collection,
            ).toSet()
        self.assertEqual(
            dataIds,
            DataCoordinateSet(
                {
                    DataCoordinate.standardize(instrument="Cam1", detector=d, graph=parentType.dimensions)
                    for d in (1, 2, 3)
                },
                parentType.dimensions,
            ),
        )
        # Search for multiple datasets of a single type with queryDatasets.
        with self.assertWarns(FutureWarning):
            childRefs2 = set(
                registry.queryDatasets(
                    "bias.wcs",
                    collections=collection,
                )
            )
        self.assertEqual({ref.datasetType for ref in childRefs2}, {childType})
        self.assertEqual({ref.dataId for ref in childRefs2}, set(dataIds))

    def testCollections(self):
        """Tests for registry methods that manage collections."""
        registry = self.makeRegistry()
        other_registry = self.makeRegistry(share_repo_with=registry)
        self.loadData(registry, "base.yaml")
        self.loadData(registry, "datasets.yaml")
        run1 = "imported_g"
        run2 = "imported_r"
        # Test setting a collection docstring after it has been created.
        registry.setCollectionDocumentation(run1, "doc for run1")
        self.assertEqual(registry.getCollectionDocumentation(run1), "doc for run1")
        registry.setCollectionDocumentation(run1, None)
        self.assertIsNone(registry.getCollectionDocumentation(run1))
        datasetType = "bias"
        # Find some datasets via their run's collection.
        dataId1 = {"instrument": "Cam1", "detector": 1}
        ref1 = registry.findDataset(datasetType, dataId1, collections=run1)
        self.assertIsNotNone(ref1)
        dataId2 = {"instrument": "Cam1", "detector": 2}
        ref2 = registry.findDataset(datasetType, dataId2, collections=run1)
        self.assertIsNotNone(ref2)
        # Associate those into a new collection, then look for them there.
        tag1 = "tag1"
        registry.registerCollection(tag1, type=CollectionType.TAGGED, doc="doc for tag1")
        # Check that we can query for old and new collections by type.
        self.assertEqual(set(registry.queryCollections(collectionTypes=CollectionType.RUN)), {run1, run2})
        self.assertEqual(
            set(registry.queryCollections(collectionTypes={CollectionType.TAGGED, CollectionType.RUN})),
            {tag1, run1, run2},
        )
        self.assertEqual(registry.getCollectionDocumentation(tag1), "doc for tag1")
        registry.associate(tag1, [ref1, ref2])
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        # Disassociate one dataset and verify that we can't find it there
        # anymore...
        registry.disassociate(tag1, [ref1])
        self.assertIsNone(registry.findDataset(datasetType, dataId1, collections=tag1))
        # ...but we can still find ref2 in tag1, and ref1 in the run.
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=run1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        collections = set(registry.queryCollections())
        self.assertEqual(collections, {run1, run2, tag1})
        # Associate both refs into tag1 again; ref2 is already there, but that
        # should be a harmless no-op.
        registry.associate(tag1, [ref1, ref2])
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        # Get a different dataset (from a different run) that has the same
        # dataset type and data ID as ref2.
        ref2b = registry.findDataset(datasetType, dataId2, collections=run2)
        self.assertNotEqual(ref2, ref2b)
        # Attempting to associate that into tag1 should be an error.
        with self.assertRaises(ConflictingDefinitionError):
            registry.associate(tag1, [ref2b])
        # That error shouldn't have messed up what we had before.
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        # Attempt to associate the conflicting dataset again, this time along
        # with a dataset that isn't in the collection and won't cause a
        # conflict.  This should also fail without modifying anything.
        dataId3 = {"instrument": "Cam1", "detector": 3}
        ref3 = registry.findDataset(datasetType, dataId3, collections=run1)
        with self.assertRaises(ConflictingDefinitionError):
            registry.associate(tag1, [ref3, ref2b])
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
        self.assertIsNone(registry.findDataset(datasetType, dataId3, collections=tag1))
        # Register a chained collection that searches [tag1, run2]
        chain1 = "chain1"
        registry.registerCollection(chain1, type=CollectionType.CHAINED)
        self.assertIs(registry.getCollectionType(chain1), CollectionType.CHAINED)
        # The chained collection exists, but has no collections in it.
        self.assertFalse(registry.getCollectionChain(chain1))
        # If we query for all collections, we should get the chained
        # collection only if we don't ask to flatten it (i.e. yield only its
        # children).
        self.assertEqual(set(registry.queryCollections(flattenChains=False)), {tag1, run1, run2, chain1})
        self.assertEqual(set(registry.queryCollections(flattenChains=True)), {tag1, run1, run2})
        # Attempt to set its child collections to something circular; that
        # should fail.
        with self.assertRaises(ValueError):
            registry.setCollectionChain(chain1, [tag1, chain1])
        # Add the child collections.
        registry.setCollectionChain(chain1, [tag1, run2])
        self.assertEqual(list(registry.getCollectionChain(chain1)), [tag1, run2])
        self.assertEqual(registry.getCollectionParentChains(tag1), {chain1})
        self.assertEqual(registry.getCollectionParentChains(run2), {chain1})
        # Refresh the other registry that points to the same repo, and make
        # sure it can see the things we've done (note that this does require
        # an explicit refresh(); that's the documented behavior, because
        # caching is all but impossible otherwise).
        if other_registry is not None:
            other_registry.refresh()
            self.assertEqual(list(other_registry.getCollectionChain(chain1)), [tag1, run2])
            self.assertEqual(other_registry.getCollectionParentChains(tag1), {chain1})
            self.assertEqual(other_registry.getCollectionParentChains(run2), {chain1})
        # Searching for dataId1 or dataId2 in the chain should return ref1 and
        # ref2, because both are in tag1.
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=chain1), ref1)
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=chain1), ref2)
        # Now disassociate ref2 from tag1.  The search (for bias) with
        # dataId2 in chain1 should then:
        # 1. not find it in tag1
        # 2. find a different dataset in run2
        registry.disassociate(tag1, [ref2])
        ref2b = registry.findDataset(datasetType, dataId2, collections=chain1)
        self.assertNotEqual(ref2b, ref2)
        self.assertEqual(ref2b, registry.findDataset(datasetType, dataId2, collections=run2))
        # Define a new chain so we can test recursive chains.
        chain2 = "chain2"
        registry.registerCollection(chain2, type=CollectionType.CHAINED)
        registry.setCollectionChain(chain2, [run2, chain1])
        self.assertEqual(registry.getCollectionParentChains(chain1), {chain2})
        self.assertEqual(registry.getCollectionParentChains(run2), {chain1, chain2})
        # Query for collections matching a regex.
        self.assertCountEqual(
            list(registry.queryCollections(re.compile("imported_."), flattenChains=False)),
            ["imported_r", "imported_g"],
        )
        # Query for collections matching a regex or an explicit str.
        self.assertCountEqual(
            list(registry.queryCollections([re.compile("imported_."), "chain1"], flattenChains=False)),
            ["imported_r", "imported_g", "chain1"],
        )
        # Searching for bias with dataId1 should find it via tag1 in chain2,
        # recursing, because it is not in run2.
        self.assertIsNone(registry.findDataset(datasetType, dataId1, collections=run2))
        self.assertEqual(registry.findDataset(datasetType, dataId1, collections=chain2), ref1)
        # Searching for bias with dataId2 should find it in run2 (ref2b).
        self.assertEqual(registry.findDataset(datasetType, dataId2, collections=chain2), ref2b)
        # Search for a flat that is in run2.  It should be found via chain2
        # as well, since chain2 includes run2 directly (and again at the end
        # of chain1).
        dataId4 = {"instrument": "Cam1", "detector": 3, "physical_filter": "Cam1-R2"}
        ref4 = registry.findDataset("flat", dataId4, collections=run2)
        self.assertIsNotNone(ref4)
        self.assertEqual(ref4, registry.findDataset("flat", dataId4, collections=chain2))
        # Deleting a collection that's part of a CHAINED collection is not
        # allowed, and is exception-safe.
        with self.assertRaises(Exception):
            registry.removeCollection(run2)
        self.assertEqual(registry.getCollectionType(run2), CollectionType.RUN)
        with self.assertRaises(Exception):
            registry.removeCollection(chain1)
        self.assertEqual(registry.getCollectionType(chain1), CollectionType.CHAINED)
        # Actually remove chain2, and test that it's gone by asking for its
        # type.
        registry.removeCollection(chain2)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(chain2)
        # Actually remove run2 and chain1, which should work now.
        registry.removeCollection(chain1)
        registry.removeCollection(run2)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(run2)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(chain1)
        # Remove tag1 as well, just to test that we can remove TAGGED
        # collections.
        registry.removeCollection(tag1)
        with self.assertRaises(MissingCollectionError):
            registry.getCollectionType(tag1)

    def testCollectionChainFlatten(self):
        """Test that Registry.setCollectionChain obeys its 'flatten' option."""
        registry = self.makeRegistry()
        registry.registerCollection("inner", CollectionType.CHAINED)
        registry.registerCollection("innermost", CollectionType.RUN)
        registry.setCollectionChain("inner", ["innermost"])
        registry.registerCollection("outer", CollectionType.CHAINED)
        registry.setCollectionChain("outer", ["inner"], flatten=False)
        self.assertEqual(list(registry.getCollectionChain("outer")), ["inner"])
        registry.setCollectionChain("outer", ["inner"], flatten=True)
        self.assertEqual(list(registry.getCollectionChain("outer")), ["innermost"])

    def testBasicTransaction(self):
        """Test that all operations within a single transaction block are
        rolled back if an exception propagates out of the block.
        """
        registry = self.makeRegistry()
        storageClass = StorageClass("testDatasetType")
        registry.storageClasses.registerStorageClass(storageClass)
        with registry.transaction():
            registry.insertDimensionData("instrument", {"name": "Cam1", "class_name": "A"})
        with self.assertRaises(ValueError):
            with registry.transaction():
                registry.insertDimensionData("instrument", {"name": "Cam2"})
                raise ValueError("Oops, something went wrong")
        # Cam1 should exist
        self.assertEqual(registry.expandDataId(instrument="Cam1").records["instrument"].class_name, "A")
        # But Cam2 and Cam3 should both not exist
        with self.assertRaises(DataIdValueError):
            registry.expandDataId(instrument="Cam2")
        with self.assertRaises(DataIdValueError):
            registry.expandDataId(instrument="Cam3")

    def testNestedTransaction(self):
        """Test that operations within a transaction block are not rolled back
        if an exception propagates out of an inner transaction block and is
        then caught.
        """
        registry = self.makeRegistry()
        dimension = registry.dimensions["instrument"]
        dataId1 = {"instrument": "DummyCam"}
        dataId2 = {"instrument": "DummyCam2"}
        checkpointReached = False
        with registry.transaction():
            # This should be added and (ultimately) committed.
            registry.insertDimensionData(dimension, dataId1)
            with self.assertRaises(sqlalchemy.exc.IntegrityError):
                with registry.transaction(savepoint=True):
                    # This does not conflict, and should succeed (but not
                    # be committed).
                    registry.insertDimensionData(dimension, dataId2)
                    checkpointReached = True
                    # This should conflict and raise, triggering a rollback
                    # of the previous insertion within the same transaction
                    # context, but not of the original insertion in the outer
                    # block.
                    registry.insertDimensionData(dimension, dataId1)
        self.assertTrue(checkpointReached)
        self.assertIsNotNone(registry.expandDataId(dataId1, graph=dimension.graph))
        with self.assertRaises(DataIdValueError):
            registry.expandDataId(dataId2, graph=dimension.graph)

    def testInstrumentDimensions(self):
        """Test queries involving only instrument dimensions, with no joins to
        skymap.
        """
        registry = self.makeRegistry()

        # We need a bunch of dimensions and datasets for this test.
        registry.insertDimensionData(
            "instrument", dict(name="DummyCam", visit_max=25, exposure_max=300, detector_max=6)
        )
        registry.insertDimensionData(
            "physical_filter",
            dict(instrument="DummyCam", name="dummy_r", band="r"),
            dict(instrument="DummyCam", name="dummy_i", band="i"),
        )
        registry.insertDimensionData(
            "detector", *[dict(instrument="DummyCam", id=i, full_name=str(i)) for i in range(1, 6)]
        )
        registry.insertDimensionData(
            "visit_system",
            dict(instrument="DummyCam", id=1, name="default"),
        )
        registry.insertDimensionData(
            "visit",
            dict(instrument="DummyCam", id=10, name="ten", physical_filter="dummy_i", visit_system=1),
            dict(instrument="DummyCam", id=11, name="eleven", physical_filter="dummy_r", visit_system=1),
            dict(instrument="DummyCam", id=20, name="twelve", physical_filter="dummy_r", visit_system=1),
        )
        for i in range(1, 6):
            registry.insertDimensionData(
                "visit_detector_region",
                dict(instrument="DummyCam", visit=10, detector=i),
                dict(instrument="DummyCam", visit=11, detector=i),
                dict(instrument="DummyCam", visit=20, detector=i),
            )
        registry.insertDimensionData(
            "exposure",
            dict(instrument="DummyCam", id=100, obs_id="100", physical_filter="dummy_i"),
            dict(instrument="DummyCam", id=101, obs_id="101", physical_filter="dummy_i"),
            dict(instrument="DummyCam", id=110, obs_id="110", physical_filter="dummy_r"),
            dict(instrument="DummyCam", id=111, obs_id="111", physical_filter="dummy_r"),
            dict(instrument="DummyCam", id=200, obs_id="200", physical_filter="dummy_r"),
            dict(instrument="DummyCam", id=201, obs_id="201", physical_filter="dummy_r"),
        )
        registry.insertDimensionData(
            "visit_definition",
            dict(instrument="DummyCam", exposure=100, visit_system=1, visit=10),
            dict(instrument="DummyCam", exposure=101, visit_system=1, visit=10),
            dict(instrument="DummyCam", exposure=110, visit_system=1, visit=11),
            dict(instrument="DummyCam", exposure=111, visit_system=1, visit=11),
            dict(instrument="DummyCam", exposure=200, visit_system=1, visit=20),
            dict(instrument="DummyCam", exposure=201, visit_system=1, visit=20),
        )
        # dataset types
        run1 = "test1_r"
        run2 = "test2_r"
        tagged2 = "test2_t"
        registry.registerRun(run1)
        registry.registerRun(run2)
        registry.registerCollection(tagged2)
        storageClass = StorageClass("testDataset")
        registry.storageClasses.registerStorageClass(storageClass)
        rawType = DatasetType(
            name="RAW",
            dimensions=registry.dimensions.extract(("instrument", "exposure", "detector")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(rawType)
        calexpType = DatasetType(
            name="CALEXP",
            dimensions=registry.dimensions.extract(("instrument", "visit", "detector")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(calexpType)

        # add pre-existing datasets
        for exposure in (100, 101, 110, 111):
            for detector in (1, 2, 3):
                # note that only 3 of 5 detectors have datasets
                dataId = dict(instrument="DummyCam", exposure=exposure, detector=detector)
                (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run1)
                # Exposures 100 and 101 appear in both run1 and tagged2:
                # 100 has different datasets in the two collections, while
                # 101 has the same dataset in both.
                if exposure == 100:
                    (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run2)
                if exposure in (100, 101):
                    registry.associate(tagged2, [ref])
        # Add pre-existing datasets to tagged2.
        for exposure in (200, 201):
            for detector in (3, 4, 5):
                # note that only 3 of 5 detectors have datasets
                dataId = dict(instrument="DummyCam", exposure=exposure, detector=detector)
                (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run2)
                registry.associate(tagged2, [ref])

        dimensions = DimensionGraph(
            registry.dimensions, dimensions=(rawType.dimensions.required | calexpType.dimensions.required)
        )
        # Test that a single dim string works as well as a list of str.
        rows = registry.queryDataIds("visit", datasets=rawType, collections=run1).expanded().toSet()
        rowsI = registry.queryDataIds(["visit"], datasets=rawType, collections=run1).expanded().toSet()
        self.assertEqual(rows, rowsI)
        # with empty expression
        rows = registry.queryDataIds(dimensions, datasets=rawType, collections=run1).expanded().toSet()
        self.assertEqual(len(rows), 4 * 3)  # 4 exposures times 3 detectors
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101, 110, 111))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10, 11))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3))

        # second collection
        rows = registry.queryDataIds(dimensions, datasets=rawType, collections=tagged2).toSet()
        self.assertEqual(len(rows), 4 * 3)  # 4 exposures times 3 detectors
        for dataId in rows:
            self.assertCountEqual(dataId.keys(), ("instrument", "detector", "exposure", "visit"))
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101, 200, 201))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10, 20))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3, 4, 5))

        # with two input collections
        rows = registry.queryDataIds(dimensions, datasets=rawType, collections=[run1, tagged2]).toSet()
        self.assertEqual(len(set(rows)), 6 * 3)  # 6 exposures times 3 detectors; set needed to de-dupe
        for dataId in rows:
            self.assertCountEqual(dataId.keys(), ("instrument", "detector", "exposure", "visit"))
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101, 110, 111, 200, 201))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10, 11, 20))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3, 4, 5))

        # limit to a single visit
        rows = registry.queryDataIds(
            dimensions, datasets=rawType, collections=run1, where="visit = 10", instrument="DummyCam"
        ).toSet()
        self.assertEqual(len(rows), 2 * 3)  # 2 exposures times 3 detectors
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10,))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3))

        # a more limiting expression, using link names instead of Table.column
        rows = registry.queryDataIds(
            dimensions,
            datasets=rawType,
            collections=run1,
            where="visit = 10 and detector > 1 and 'DummyCam'=instrument",
        ).toSet()
        self.assertEqual(len(rows), 2 * 2)  # 2 exposures times 2 detectors
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (10,))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (2, 3))

        # queryDataIds with only one of `datasets` and `collections` is an
        # error.
        with self.assertRaises(CollectionError):
            registry.queryDataIds(dimensions, datasets=rawType)
        with self.assertRaises(ArgumentError):
            registry.queryDataIds(dimensions, collections=run1)

        # an expression that excludes everything
        rows = registry.queryDataIds(
            dimensions, datasets=rawType, collections=run1, where="visit > 1000", instrument="DummyCam"
        ).toSet()
        self.assertEqual(len(rows), 0)

        # Selecting by physical_filter: it is not in the dimensions, but it
        # is part of the full expression, so it should work too.
        rows = registry.queryDataIds(
            dimensions,
            datasets=rawType,
            collections=run1,
            where="physical_filter = 'dummy_r'",
            instrument="DummyCam",
        ).toSet()
        self.assertEqual(len(rows), 2 * 3)  # 2 exposures times 3 detectors
        self.assertCountEqual({dataId["exposure"] for dataId in rows}, (110, 111))
        self.assertCountEqual({dataId["visit"] for dataId in rows}, (11,))
        self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3))

    def testSkyMapDimensions(self):
        """Tests involving only skymap dimensions, no joins to instrument."""
        registry = self.makeRegistry()

        # We need a bunch of dimensions and datasets for this test; we want
        # "band" in the test, so we also have to add physical_filter
        # dimensions.
        registry.insertDimensionData("instrument", dict(instrument="DummyCam"))
        registry.insertDimensionData(
            "physical_filter",
            dict(instrument="DummyCam", name="dummy_r", band="r"),
            dict(instrument="DummyCam", name="dummy_i", band="i"),
        )
        registry.insertDimensionData("skymap", dict(name="DummyMap", hash=b"sha!"))
        for tract in range(10):
            registry.insertDimensionData("tract", dict(skymap="DummyMap", id=tract))
            registry.insertDimensionData(
                "patch",
                *[dict(skymap="DummyMap", tract=tract, id=patch, cell_x=0, cell_y=0) for patch in range(10)],
            )

        # dataset types
        run = "tésτ"
        registry.registerRun(run)
        storageClass = StorageClass("testDataset")
        registry.storageClasses.registerStorageClass(storageClass)
        calexpType = DatasetType(
            name="deepCoadd_calexp",
            dimensions=registry.dimensions.extract(("skymap", "tract", "patch", "band")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(calexpType)
        mergeType = DatasetType(
            name="deepCoadd_mergeDet",
            dimensions=registry.dimensions.extract(("skymap", "tract", "patch")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(mergeType)
        measType = DatasetType(
            name="deepCoadd_meas",
            dimensions=registry.dimensions.extract(("skymap", "tract", "patch", "band")),
            storageClass=storageClass,
        )
        registry.registerDatasetType(measType)

        dimensions = DimensionGraph(
            registry.dimensions,
            dimensions=(
                calexpType.dimensions.required | mergeType.dimensions.required | measType.dimensions.required
            ),
        )

        # add pre-existing datasets
        for tract in (1, 3, 5):
            for patch in (2, 4, 6, 7):
                dataId = dict(skymap="DummyMap", tract=tract, patch=patch)
                registry.insertDatasets(mergeType, dataIds=[dataId], run=run)
                for aFilter in ("i", "r"):
                    dataId = dict(skymap="DummyMap", tract=tract, patch=patch, band=aFilter)
                    registry.insertDatasets(calexpType, dataIds=[dataId], run=run)

        # with empty expression
        rows = registry.queryDataIds(dimensions, datasets=[calexpType, mergeType], collections=run).toSet()
        self.assertEqual(len(rows), 3 * 4 * 2)  # 3 tracts x 4 patches x 2 filters
        for dataId in rows:
            self.assertCountEqual(dataId.keys(), ("skymap", "tract", "patch", "band"))
        self.assertCountEqual({dataId["tract"] for dataId in rows}, (1, 3, 5))
        self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 4, 6, 7))
        self.assertCountEqual({dataId["band"] for dataId in rows}, ("i", "r"))

        # limit to 2 tracts and 2 patches
        rows = registry.queryDataIds(
            dimensions,
            datasets=[calexpType, mergeType],
            collections=run,
            where="tract IN (1, 5) AND patch IN (2, 7)",
            skymap="DummyMap",
        ).toSet()
        self.assertEqual(len(rows), 2 * 2 * 2)  # 2 tracts x 2 patches x 2 filters
        self.assertCountEqual({dataId["tract"] for dataId in rows}, (1, 5))
        self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 7))
        self.assertCountEqual({dataId["band"] for dataId in rows}, ("i", "r"))

        # limit to a single filter
        rows = registry.queryDataIds(
            dimensions, datasets=[calexpType, mergeType], collections=run, where="band = 'i'"
        ).toSet()
        self.assertEqual(len(rows), 3 * 4 * 1)  # 3 tracts x 4 patches x 1 filter
        self.assertCountEqual({dataId["tract"] for dataId in rows}, (1, 3, 5))
        self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 4, 6, 7))
        self.assertCountEqual({dataId["band"] for dataId in rows}, ("i",))

        # Specifying a non-existing skymap is an exception.
        with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
            rows = registry.queryDataIds(
                dimensions, datasets=[calexpType, mergeType], collections=run, where="skymap = 'Mars'"
            ).toSet()
1235 def testSpatialJoin(self):
1236 """Test queries that involve spatial overlap joins."""
1237 registry = self.makeRegistry()
1238 self.loadData(registry, "hsc-rc2-subset.yaml")
1240 # Dictionary of spatial DatabaseDimensionElements, keyed by the name of
1241 # the TopologicalFamily they belong to. We'll relate all elements in
1242 # each family to all of the elements in each other family.
1243 families = defaultdict(set)
1244 # Dictionary of {element.name: {dataId: region}}.
1245 regions = {}
1246 for element in registry.dimensions.getDatabaseElements():
1247 if element.spatial is not None:
1248 families[element.spatial.name].add(element)
1249 regions[element.name] = {
1250 record.dataId: record.region for record in registry.queryDimensionRecords(element)
1251 }
1253 # If this check fails, it's not necessarily a problem - it may just be
1254 # a reasonable change to the default dimension definitions - but the
1255 # test below depends on there being more than one family to do anything
1256 # useful.
1257 self.assertEqual(len(families), 2)
1259 # Overlap DatabaseDimensionElements with each other.
1260 for family1, family2 in itertools.combinations(families, 2):
1261 for element1, element2 in itertools.product(families[family1], families[family2]):
1262 graph = DimensionGraph.union(element1.graph, element2.graph)
1263 # Construct expected set of overlapping data IDs via a
1264 # brute-force comparison of the regions we've already fetched.
1265 expected = {
1266 DataCoordinate.standardize({**dataId1.byName(), **dataId2.byName()}, graph=graph)
1267 for (dataId1, region1), (dataId2, region2) in itertools.product(
1268 regions[element1.name].items(), regions[element2.name].items()
1269 )
1270 if not region1.isDisjointFrom(region2)
1271 }
1272 self.assertGreater(len(expected), 2, msg="Test that we aren't just comparing empty sets.")
1273 queried = set(registry.queryDataIds(graph))
1274 self.assertEqual(expected, queried)
1276 # Overlap each DatabaseDimensionElement with the commonSkyPix system.
1277 commonSkyPix = registry.dimensions.commonSkyPix
1278 for elementName, elementRegions in regions.items():
1279 graph = DimensionGraph.union(registry.dimensions[elementName].graph, commonSkyPix.graph)
1280 expected = set()
1281 for dataId, region in elementRegions.items():
1282 for begin, end in commonSkyPix.pixelization.envelope(region):
1283 expected.update(
1284 DataCoordinate.standardize({commonSkyPix.name: index, **dataId.byName()}, graph=graph)
1285 for index in range(begin, end)
1286 )
1287 self.assertGreater(len(expected), 2, msg="Test that we aren't just comparing empty sets.")
1288 queried = set(registry.queryDataIds(graph))
1289 self.assertEqual(expected, queried)
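# A minimal standalone sketch of the implicit spatial join exercised
# above (not part of the test suite). It assumes a `registry` populated
# from hsc-rc2-subset.yaml; "hsc_rings_v1" is an assumed skymap name.
# Because "tract" and "visit" belong to different topological families,
# asking for both makes queryDataIds add the overlap join automatically.
overlaps = registry.queryDataIds(["tract", "visit"], instrument="HSC", skymap="hsc_rings_v1")
for data_id in overlaps:
    print(data_id["tract"], data_id["visit"])  # spatially overlapping pairs only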
1291 def testAbstractQuery(self):
1292 """Test that we can run a query that just lists the known
1293 bands. This is tricky because band is
1294 backed by a query against physical_filter.
1295 """
1296 registry = self.makeRegistry()
1297 registry.insertDimensionData("instrument", dict(name="DummyCam"))
1298 registry.insertDimensionData(
1299 "physical_filter",
1300 dict(instrument="DummyCam", name="dummy_i", band="i"),
1301 dict(instrument="DummyCam", name="dummy_i2", band="i"),
1302 dict(instrument="DummyCam", name="dummy_r", band="r"),
1303 )
1304 rows = registry.queryDataIds(["band"]).toSet()
1305 self.assertCountEqual(
1306 rows,
1307 [
1308 DataCoordinate.standardize(band="i", universe=registry.dimensions),
1309 DataCoordinate.standardize(band="r", universe=registry.dimensions),
1310 ],
1311 )
1313 def testAttributeManager(self):
1314 """Test basic functionality of attribute manager."""
1315 # Number of attributes with schema versions in a fresh database:
1316 # 6 managers with 2 records per manager, plus config for dimensions.
1317 VERSION_COUNT = 6 * 2 + 1
1319 registry = self.makeRegistry()
1320 attributes = registry._managers.attributes
1322 # check what get() returns for a non-existent key
1323 self.assertIsNone(attributes.get("attr"))
1324 self.assertEqual(attributes.get("attr", ""), "")
1325 self.assertEqual(attributes.get("attr", "Value"), "Value")
1326 self.assertEqual(len(list(attributes.items())), VERSION_COUNT)
1328 # cannot store empty key or value
1329 with self.assertRaises(ValueError):
1330 attributes.set("", "value")
1331 with self.assertRaises(ValueError):
1332 attributes.set("attr", "")
1334 # set value of a non-existent key
1335 attributes.set("attr", "value")
1336 self.assertEqual(len(list(attributes.items())), VERSION_COUNT + 1)
1337 self.assertEqual(attributes.get("attr"), "value")
1339 # update value of existing key
1340 with self.assertRaises(ButlerAttributeExistsError):
1341 attributes.set("attr", "value2")
1343 attributes.set("attr", "value2", force=True)
1344 self.assertEqual(len(list(attributes.items())), VERSION_COUNT + 1)
1345 self.assertEqual(attributes.get("attr"), "value2")
1347 # delete existing key
1348 self.assertTrue(attributes.delete("attr"))
1349 self.assertEqual(len(list(attributes.items())), VERSION_COUNT)
1351 # delete a non-existent key
1352 self.assertFalse(attributes.delete("non-attr"))
1354 # store a bunch of keys and get the list back
1355 data = [
1356 ("version.core", "1.2.3"),
1357 ("version.dimensions", "3.2.1"),
1358 ("config.managers.opaque", "ByNameOpaqueTableStorageManager"),
1359 ]
1360 for key, value in data:
1361 attributes.set(key, value)
1362 items = dict(attributes.items())
1363 for key, value in data:
1364 self.assertEqual(items[key], value)
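# A minimal sketch of the attribute-manager API tested above (not part
# of the test suite); assumes a fresh `registry` from makeRegistry().
# The manager is a simple string key/value store.
attributes = registry._managers.attributes
attributes.set("pipeline.version", "1.0.0")  # new key
attributes.set("pipeline.version", "1.0.1", force=True)  # overwriting requires force=True
assert attributes.get("pipeline.version") == "1.0.1"
assert attributes.get("no-such-key", "fallback") == "fallback"  # default for missing keys
assert attributes.delete("pipeline.version")  # True if the key existed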
1366 def testQueryDatasetsDeduplication(self):
1367 """Test that the findFirst option to queryDatasets selects datasets
1368 from collections in the order given".
1369 """
1370 registry = self.makeRegistry()
1371 self.loadData(registry, "base.yaml")
1372 self.loadData(registry, "datasets.yaml")
1373 self.assertCountEqual(
1374 list(registry.queryDatasets("bias", collections=["imported_g", "imported_r"])),
1375 [
1376 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1377 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g"),
1378 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g"),
1379 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r"),
1380 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r"),
1381 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1382 ],
1383 )
1384 self.assertCountEqual(
1385 list(registry.queryDatasets("bias", collections=["imported_g", "imported_r"], findFirst=True)),
1386 [
1387 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1388 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g"),
1389 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g"),
1390 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1391 ],
1392 )
1393 self.assertCountEqual(
1394 list(registry.queryDatasets("bias", collections=["imported_r", "imported_g"], findFirst=True)),
1395 [
1396 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1397 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r"),
1398 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r"),
1399 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1400 ],
1401 )
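# A minimal sketch of the findFirst semantics verified above (not part
# of the test suite); assumes a `registry` loaded with base.yaml and
# datasets.yaml. findFirst=True resolves each data ID to the first
# collection, in the order given, that contains a matching dataset, so
# reversing the collection list can change the run a ref comes from.
g_first = {
    (ref.dataId["detector"], ref.run)
    for ref in registry.queryDatasets("bias", collections=["imported_g", "imported_r"], findFirst=True)
}
r_first = {
    (ref.dataId["detector"], ref.run)
    for ref in registry.queryDatasets("bias", collections=["imported_r", "imported_g"], findFirst=True)
}
# Detectors present in both runs resolve to whichever collection is searched first.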
1403 def testQueryResults(self):
1404 """Test querying for data IDs and then manipulating the QueryResults
1405 object returned to perform other queries.
1406 """
1407 registry = self.makeRegistry()
1408 self.loadData(registry, "base.yaml")
1409 self.loadData(registry, "datasets.yaml")
1410 bias = registry.getDatasetType("bias")
1411 flat = registry.getDatasetType("flat")
1412 # Obtain expected results from methods other than those we're testing
1413 # here. That includes:
1414 # - the dimensions of the data IDs we want to query:
1415 expectedGraph = DimensionGraph(registry.dimensions, names=["detector", "physical_filter"])
1416 # - the dimensions of some other data IDs we'll extract from that:
1417 expectedSubsetGraph = DimensionGraph(registry.dimensions, names=["detector"])
1418 # - the data IDs we expect to obtain from the first queries:
1419 expectedDataIds = DataCoordinateSet(
1420 {
1421 DataCoordinate.standardize(
1422 instrument="Cam1", detector=d, physical_filter=p, universe=registry.dimensions
1423 )
1424 for d, p in itertools.product({1, 2, 3}, {"Cam1-G", "Cam1-R1", "Cam1-R2"})
1425 },
1426 graph=expectedGraph,
1427 hasFull=False,
1428 hasRecords=False,
1429 )
1430 # - the flat datasets we expect to find from those data IDs, in just
1431 # one collection (so deduplication is irrelevant):
1432 expectedFlats = [
1433 registry.findDataset(
1434 flat, instrument="Cam1", detector=1, physical_filter="Cam1-R1", collections="imported_r"
1435 ),
1436 registry.findDataset(
1437 flat, instrument="Cam1", detector=2, physical_filter="Cam1-R1", collections="imported_r"
1438 ),
1439 registry.findDataset(
1440 flat, instrument="Cam1", detector=3, physical_filter="Cam1-R2", collections="imported_r"
1441 ),
1442 ]
1443 # - the data IDs we expect to extract from that:
1444 expectedSubsetDataIds = expectedDataIds.subset(expectedSubsetGraph)
1445 # - the bias datasets we expect to find from those data IDs, after we
1446 # subset out the physical_filter dimension, first with duplicates:
1447 expectedAllBiases = [
1448 registry.findDataset(bias, instrument="Cam1", detector=1, collections="imported_g"),
1449 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_g"),
1450 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_g"),
1451 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_r"),
1452 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_r"),
1453 ]
1454 # - ...and without duplicates:
1455 expectedDeduplicatedBiases = [
1456 registry.findDataset(bias, instrument="Cam1", detector=1, collections="imported_g"),
1457 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_r"),
1458 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_r"),
1459 ]
1460 # Test against those expected results, using a "lazy" query for the
1461 # data IDs (which re-executes that query each time we use it to do
1462 # something new).
1463 dataIds = registry.queryDataIds(
1464 ["detector", "physical_filter"],
1465 where="detector.purpose = 'SCIENCE'", # this rejects detector=4
1466 instrument="Cam1",
1467 )
1468 self.assertEqual(dataIds.graph, expectedGraph)
1469 self.assertEqual(dataIds.toSet(), expectedDataIds)
1470 self.assertCountEqual(
1471 list(
1472 dataIds.findDatasets(
1473 flat,
1474 collections=["imported_r"],
1475 )
1476 ),
1477 expectedFlats,
1478 )
1479 subsetDataIds = dataIds.subset(expectedSubsetGraph, unique=True)
1480 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1481 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1482 self.assertCountEqual(
1483 list(subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=False)),
1484 expectedAllBiases,
1485 )
1486 self.assertCountEqual(
1487 list(subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)),
1488 expectedDeduplicatedBiases,
1489 )
1491 # A dataset type whose dimensions are not in the query should raise.
1492 with self.assertRaises(ValueError):
1493 subsetDataIds.findDatasets("flat", collections=["imported_r", "imported_g"], findFirst=True)
1495 # Use a component dataset type.
1496 self.assertCountEqual(
1497 [
1498 ref.makeComponentRef("image")
1499 for ref in subsetDataIds.findDatasets(
1500 bias,
1501 collections=["imported_r", "imported_g"],
1502 findFirst=False,
1503 )
1504 ],
1505 [ref.makeComponentRef("image") for ref in expectedAllBiases],
1506 )
1508 # Use a named dataset type that does not exist and a dataset type
1509 # object that does not exist.
1510 unknown_type = DatasetType("not_known", dimensions=bias.dimensions, storageClass="Exposure")
1512 # Test both string name and dataset type object.
1513 test_type: str | DatasetType
1514 for test_type, test_type_name in (
1515 (unknown_type, unknown_type.name),
1516 (unknown_type.name, unknown_type.name),
1517 ):
1518 with self.assertRaisesRegex(DatasetTypeError, expected_regex=test_type_name):
1519 list(
1520 subsetDataIds.findDatasets(
1521 test_type, collections=["imported_r", "imported_g"], findFirst=True
1522 )
1523 )
1525 # Materialize the bias dataset queries (only) by putting the results
1526 # into temporary tables, then repeat those tests.
1527 with subsetDataIds.findDatasets(
1528 bias, collections=["imported_r", "imported_g"], findFirst=False
1529 ).materialize() as biases:
1530 self.assertCountEqual(list(biases), expectedAllBiases)
1531 with subsetDataIds.findDatasets(
1532 bias, collections=["imported_r", "imported_g"], findFirst=True
1533 ).materialize() as biases:
1534 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1535 # Materialize the data ID subset query, but not the dataset queries.
1536 with subsetDataIds.materialize() as subsetDataIds:
1537 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1538 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1539 self.assertCountEqual(
1540 list(
1541 subsetDataIds.findDatasets(
1542 bias, collections=["imported_r", "imported_g"], findFirst=False
1543 )
1544 ),
1545 expectedAllBiases,
1546 )
1547 self.assertCountEqual(
1548 list(
1549 subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)
1550 ),
1551 expectedDeduplicatedBiases,
1552 )
1553 # Materialize the dataset queries, too.
1554 with subsetDataIds.findDatasets(
1555 bias, collections=["imported_r", "imported_g"], findFirst=False
1556 ).materialize() as biases:
1557 self.assertCountEqual(list(biases), expectedAllBiases)
1558 with subsetDataIds.findDatasets(
1559 bias, collections=["imported_r", "imported_g"], findFirst=True
1560 ).materialize() as biases:
1561 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1562 # Materialize the original query, but none of the follow-up queries.
1563 with dataIds.materialize() as dataIds:
1564 self.assertEqual(dataIds.graph, expectedGraph)
1565 self.assertEqual(dataIds.toSet(), expectedDataIds)
1566 self.assertCountEqual(
1567 list(
1568 dataIds.findDatasets(
1569 flat,
1570 collections=["imported_r"],
1571 )
1572 ),
1573 expectedFlats,
1574 )
1575 subsetDataIds = dataIds.subset(expectedSubsetGraph, unique=True)
1576 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1577 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1578 self.assertCountEqual(
1579 list(
1580 subsetDataIds.findDatasets(
1581 bias, collections=["imported_r", "imported_g"], findFirst=False
1582 )
1583 ),
1584 expectedAllBiases,
1585 )
1586 self.assertCountEqual(
1587 list(
1588 subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)
1589 ),
1590 expectedDeduplicatedBiases,
1591 )
1592 # Materialize just the bias dataset queries.
1593 with subsetDataIds.findDatasets(
1594 bias, collections=["imported_r", "imported_g"], findFirst=False
1595 ).materialize() as biases:
1596 self.assertCountEqual(list(biases), expectedAllBiases)
1597 with subsetDataIds.findDatasets(
1598 bias, collections=["imported_r", "imported_g"], findFirst=True
1599 ).materialize() as biases:
1600 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1601 # Materialize the subset data ID query, but not the dataset
1602 # queries.
1603 with subsetDataIds.materialize() as subsetDataIds:
1604 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1605 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1606 self.assertCountEqual(
1607 list(
1608 subsetDataIds.findDatasets(
1609 bias, collections=["imported_r", "imported_g"], findFirst=False
1610 )
1611 ),
1612 expectedAllBiases,
1613 )
1614 self.assertCountEqual(
1615 list(
1616 subsetDataIds.findDatasets(
1617 bias, collections=["imported_r", "imported_g"], findFirst=True
1618 )
1619 ),
1620 expectedDeduplicatedBiases,
1621 )
1622 # Materialize the bias dataset queries, too, so now we're
1623 # materializing every single step.
1624 with subsetDataIds.findDatasets(
1625 bias, collections=["imported_r", "imported_g"], findFirst=False
1626 ).materialize() as biases:
1627 self.assertCountEqual(list(biases), expectedAllBiases)
1628 with subsetDataIds.findDatasets(
1629 bias, collections=["imported_r", "imported_g"], findFirst=True
1630 ).materialize() as biases:
1631 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
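# A minimal sketch of the lazy-vs-materialized distinction tested above
# (not part of the test suite); assumes a `registry` loaded as above.
# A results object normally re-executes its query each time it is
# consumed; materialize() snapshots the rows into a temporary table for
# the duration of the context manager, so follow-up operations read
# from that snapshot instead.
data_ids = registry.queryDataIds(["detector"], instrument="Cam1")  # lazy
with data_ids.materialize() as frozen:
    first_pass = set(frozen)  # reads the temporary table
    second_pass = set(frozen)  # no re-query; same snapshot
    assert first_pass == second_pass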
1633 def testStorageClassPropagation(self):
1634 """Test that queries for datasets respect the storage class passed in
1635 as part of a full dataset type.
1636 """
1637 registry = self.makeRegistry()
1638 self.loadData(registry, "base.yaml")
1639 dataset_type_in_registry = DatasetType(
1640 "tbl", dimensions=["instrument"], storageClass="Packages", universe=registry.dimensions
1641 )
1642 registry.registerDatasetType(dataset_type_in_registry)
1643 run = "run1"
1644 registry.registerRun(run)
1645 (inserted_ref,) = registry.insertDatasets(
1646 dataset_type_in_registry, [registry.expandDataId(instrument="Cam1")], run=run
1647 )
1648 self.assertEqual(inserted_ref.datasetType, dataset_type_in_registry)
1649 query_dataset_type = DatasetType(
1650 "tbl", dimensions=["instrument"], storageClass="StructuredDataDict", universe=registry.dimensions
1651 )
1652 self.assertNotEqual(dataset_type_in_registry, query_dataset_type)
1653 query_datasets_result = registry.queryDatasets(query_dataset_type, collections=[run])
1654 self.assertEqual(query_datasets_result.parentDatasetType, query_dataset_type) # type: ignore
1655 (query_datasets_ref,) = query_datasets_result
1656 self.assertEqual(query_datasets_ref.datasetType, query_dataset_type)
1657 query_data_ids_find_datasets_result = registry.queryDataIds(["instrument"]).findDatasets(
1658 query_dataset_type, collections=[run]
1659 )
1660 self.assertEqual(query_data_ids_find_datasets_result.parentDatasetType, query_dataset_type)
1661 (query_data_ids_find_datasets_ref,) = query_data_ids_find_datasets_result
1662 self.assertEqual(query_data_ids_find_datasets_ref.datasetType, query_dataset_type)
1663 query_dataset_types_result = registry.queryDatasetTypes(query_dataset_type)
1664 self.assertEqual(list(query_dataset_types_result), [query_dataset_type])
1665 find_dataset_ref = registry.findDataset(query_dataset_type, instrument="Cam1", collections=[run])
1666 self.assertEqual(find_dataset_ref.datasetType, query_dataset_type)
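# A minimal sketch of the storage-class override behavior tested above
# (not part of the test suite); assumes the "tbl" dataset and "run1"
# run registered as in the test. Passing a full DatasetType whose
# storage class differs from the registered one makes every returned
# ref carry the query-time storage class.
pseudo_type = DatasetType(
    "tbl", dimensions=["instrument"], storageClass="StructuredDataDict", universe=registry.dimensions
)
(ref,) = registry.queryDatasets(pseudo_type, collections=["run1"])
assert ref.datasetType == pseudo_type  # not the "Packages" type stored in the registry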
1668 def testEmptyDimensionsQueries(self):
1669 """Test Query and QueryResults objects in the case where there are no
1670 dimensions.
1671 """
1672 # Set up test data: one dataset type, two runs, one dataset in each.
1673 registry = self.makeRegistry()
1674 self.loadData(registry, "base.yaml")
1675 schema = DatasetType("schema", dimensions=registry.dimensions.empty, storageClass="Catalog")
1676 registry.registerDatasetType(schema)
1677 dataId = DataCoordinate.makeEmpty(registry.dimensions)
1678 run1 = "run1"
1679 run2 = "run2"
1680 registry.registerRun(run1)
1681 registry.registerRun(run2)
1682 (dataset1,) = registry.insertDatasets(schema, dataIds=[dataId], run=run1)
1683 (dataset2,) = registry.insertDatasets(schema, dataIds=[dataId], run=run2)
1684 # Query directly for both of the datasets together, then for each one individually.
1685 self.checkQueryResults(
1686 registry.queryDatasets(schema, collections=[run1, run2], findFirst=False), [dataset1, dataset2]
1687 )
1688 self.checkQueryResults(
1689 registry.queryDatasets(schema, collections=[run1, run2], findFirst=True),
1690 [dataset1],
1691 )
1692 self.checkQueryResults(
1693 registry.queryDatasets(schema, collections=[run2, run1], findFirst=True),
1694 [dataset2],
1695 )
1696 # Query for data IDs with no dimensions.
1697 dataIds = registry.queryDataIds([])
1698 self.checkQueryResults(dataIds, [dataId])
1699 # Use queried data IDs to find the datasets.
1700 self.checkQueryResults(
1701 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1702 [dataset1, dataset2],
1703 )
1704 self.checkQueryResults(
1705 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1706 [dataset1],
1707 )
1708 self.checkQueryResults(
1709 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1710 [dataset2],
1711 )
1712 # Now materialize the data ID query results and repeat those tests.
1713 with dataIds.materialize() as dataIds:
1714 self.checkQueryResults(dataIds, [dataId])
1715 self.checkQueryResults(
1716 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1717 [dataset1],
1718 )
1719 self.checkQueryResults(
1720 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1721 [dataset2],
1722 )
1723 # Query for non-empty data IDs, then subset that to get the empty one.
1724 # Repeat the above tests starting from that.
1725 dataIds = registry.queryDataIds(["instrument"]).subset(registry.dimensions.empty, unique=True)
1726 self.checkQueryResults(dataIds, [dataId])
1727 self.checkQueryResults(
1728 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1729 [dataset1, dataset2],
1730 )
1731 self.checkQueryResults(
1732 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1733 [dataset1],
1734 )
1735 self.checkQueryResults(
1736 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1737 [dataset2],
1738 )
1739 with dataIds.materialize() as dataIds:
1740 self.checkQueryResults(dataIds, [dataId])
1741 self.checkQueryResults(
1742 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1743 [dataset1, dataset2],
1744 )
1745 self.checkQueryResults(
1746 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1747 [dataset1],
1748 )
1749 self.checkQueryResults(
1750 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1751 [dataset2],
1752 )
1753 # Query for non-empty data IDs, then materialize, then subset to get
1754 # the empty one. Repeat again.
1755 with registry.queryDataIds(["instrument"]).materialize() as nonEmptyDataIds:
1756 dataIds = nonEmptyDataIds.subset(registry.dimensions.empty, unique=True)
1757 self.checkQueryResults(dataIds, [dataId])
1758 self.checkQueryResults(
1759 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1760 [dataset1, dataset2],
1761 )
1762 self.checkQueryResults(
1763 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1764 [dataset1],
1765 )
1766 self.checkQueryResults(
1767 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1768 [dataset2],
1769 )
1770 with dataIds.materialize() as dataIds:
1771 self.checkQueryResults(dataIds, [dataId])
1772 self.checkQueryResults(
1773 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1774 [dataset1, dataset2],
1775 )
1776 self.checkQueryResults(
1777 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1778 [dataset1],
1779 )
1780 self.checkQueryResults(
1781 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1782 [dataset2],
1783 )
1784 # Query for non-empty data IDs with a constraint on an empty-data-ID
1785 # dataset that exists.
1786 dataIds = registry.queryDataIds(["instrument"], datasets="schema", collections=...)
1787 self.checkQueryResults(
1788 dataIds.subset(unique=True),
1789 [DataCoordinate.standardize(instrument="Cam1", universe=registry.dimensions)],
1790 )
1791 # Again query for non-empty data IDs with a constraint on empty-data-ID
1792 # datasets, but when the datasets don't exist. We delete the existing
1793 # dataset and query just that collection rather than creating a new
1794 # empty collection because this is a bit less likely for our build-time
1795 # logic to shortcut out (via the collection summaries), and such a
1796 # shortcut would make this test a bit more trivial than we'd like.
1797 registry.removeDatasets([dataset2])
1798 dataIds = registry.queryDataIds(["instrument"], datasets="schema", collections=run2)
1799 self.checkQueryResults(dataIds, [])
1801 def testDimensionDataModifications(self):
1802 """Test that modifying dimension records via:
1803 syncDimensionData(..., update=True) and
1804 insertDimensionData(..., replace=True) works as expected, even in the
1805 presence of datasets using those dimensions and spatial overlap
1806 relationships.
1807 """
1809 def unpack_range_set(ranges: lsst.sphgeom.RangeSet) -> Iterator[int]:
1810 """Unpack a sphgeom.RangeSet into the integers it contains."""
1811 for begin, end in ranges:
1812 yield from range(begin, end)
1814 def range_set_hull(
1815 ranges: lsst.sphgeom.RangeSet,
1816 pixelization: lsst.sphgeom.HtmPixelization,
1817 ) -> lsst.sphgeom.ConvexPolygon:
1818 """Create a ConvexPolygon hull of the region defined by a set of
1819 HTM pixelization index ranges.
1820 """
1821 points = []
1822 for index in unpack_range_set(ranges):
1823 points.extend(pixelization.triangle(index).getVertices())
1824 return lsst.sphgeom.ConvexPolygon(points)
1826 # Use HTM to set up an initial parent region (one arbitrary trixel)
1827 # and four child regions (the trixels within the parent at the next
1828 # level). We'll use the parent as a tract/visit region and the children
1829 # as its patch/visit_detector regions.
1830 registry = self.makeRegistry()
1831 htm6 = registry.dimensions.skypix["htm"][6].pixelization
1832 commonSkyPix = registry.dimensions.commonSkyPix.pixelization
1833 index = 12288
1834 child_ranges_small = lsst.sphgeom.RangeSet(index).scaled(4)
1835 assert htm6.universe().contains(child_ranges_small)
1836 child_regions_small = [htm6.triangle(i) for i in unpack_range_set(child_ranges_small)]
1837 parent_region_small = lsst.sphgeom.ConvexPolygon(
1838 list(itertools.chain.from_iterable(c.getVertices() for c in child_regions_small))
1839 )
1840 assert all(parent_region_small.contains(c) for c in child_regions_small)
1841 # Make a larger version of each child region, defined to be the set of
1842 # htm6 trixels that overlap the original's bounding circle. Make a new
1843 # parent that's the convex hull of the new children.
1844 child_regions_large = [
1845 range_set_hull(htm6.envelope(c.getBoundingCircle()), htm6) for c in child_regions_small
1846 ]
1847 assert all(large.contains(small) for large, small in zip(child_regions_large, child_regions_small))
1848 parent_region_large = lsst.sphgeom.ConvexPolygon(
1849 list(itertools.chain.from_iterable(c.getVertices() for c in child_regions_large))
1850 )
1851 assert all(parent_region_large.contains(c) for c in child_regions_large)
1852 assert parent_region_large.contains(parent_region_small)
1853 assert not parent_region_small.contains(parent_region_large)
1854 assert not all(parent_region_small.contains(c) for c in child_regions_large)
1855 # Find some commonSkyPix indices that overlap the large regions but do not
1856 # overlap the small regions. We use commonSkyPix here to make sure the
1857 # real tests later involve what's in the database, not just post-query
1858 # filtering of regions.
1859 child_difference_indices = []
1860 for large, small in zip(child_regions_large, child_regions_small):
1861 difference = list(unpack_range_set(commonSkyPix.envelope(large) - commonSkyPix.envelope(small)))
1862 assert difference, "if this is empty, we can't test anything useful with these regions"
1863 assert all(
1864 not commonSkyPix.triangle(d).isDisjointFrom(large)
1865 and commonSkyPix.triangle(d).isDisjointFrom(small)
1866 for d in difference
1867 )
1868 child_difference_indices.append(difference)
1869 parent_difference_indices = list(
1870 unpack_range_set(
1871 commonSkyPix.envelope(parent_region_large) - commonSkyPix.envelope(parent_region_small)
1872 )
1873 )
1874 assert parent_difference_indices, "if this is empty, we can't test anything useful with these regions"
1875 assert all(
1876 (
1877 not commonSkyPix.triangle(d).isDisjointFrom(parent_region_large)
1878 and commonSkyPix.triangle(d).isDisjointFrom(parent_region_small)
1879 )
1880 for d in parent_difference_indices
1881 )
1882 # Now that we've finally got those regions, we'll insert the large ones
1883 # as tract/patch dimension records.
1884 skymap_name = "testing_v1"
1885 registry.insertDimensionData(
1886 "skymap",
1887 {
1888 "name": skymap_name,
1889 "hash": bytes([42]),
1890 "tract_max": 1,
1891 "patch_nx_max": 2,
1892 "patch_ny_max": 2,
1893 },
1894 )
1895 registry.insertDimensionData("tract", {"skymap": skymap_name, "id": 0, "region": parent_region_large})
1896 registry.insertDimensionData(
1897 "patch",
1898 *[
1899 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1900 for n, c in enumerate(child_regions_large)
1901 ],
1902 )
1903 # Add a dataset that uses these dimensions to make sure that modifying
1904 # them doesn't disrupt foreign keys (need to make sure DB doesn't
1905 # implement insert with replace=True as delete-then-insert).
1906 dataset_type = DatasetType(
1907 "coadd",
1908 dimensions=["tract", "patch"],
1909 universe=registry.dimensions,
1910 storageClass="Exposure",
1911 )
1912 registry.registerDatasetType(dataset_type)
1913 registry.registerCollection("the_run", CollectionType.RUN)
1914 registry.insertDatasets(
1915 dataset_type,
1916 [{"skymap": skymap_name, "tract": 0, "patch": 2}],
1917 run="the_run",
1918 )
1919 # Query for tracts and patches that overlap some "difference"
1920 # commonSkyPix pixels; there should be overlaps, because the database has
1921 # the "large" suite of regions.
1922 self.assertEqual(
1923 {0},
1924 {
1925 data_id["tract"]
1926 for data_id in registry.queryDataIds(
1927 ["tract"],
1928 skymap=skymap_name,
1929 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
1930 )
1931 },
1932 )
1933 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
1934 self.assertIn(
1935 patch_id,
1936 {
1937 data_id["patch"]
1938 for data_id in registry.queryDataIds(
1939 ["patch"],
1940 skymap=skymap_name,
1941 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
1942 )
1943 },
1944 )
1945 # Use syncDimensionData to update the tract region and insertDimensionData
1946 # to update the patch regions, switching to the "small" suite.
1947 updated = registry.syncDimensionData(
1948 "tract",
1949 {"skymap": skymap_name, "id": 0, "region": parent_region_small},
1950 update=True,
1951 )
1952 self.assertEqual(updated, {"region": parent_region_large})
1953 registry.insertDimensionData(
1954 "patch",
1955 *[
1956 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1957 for n, c in enumerate(child_regions_small)
1958 ],
1959 replace=True,
1960 )
1961 # Query again; there should now be no such overlaps, because the
1962 # database has the "small" suite of regions.
1963 self.assertFalse(
1964 set(
1965 registry.queryDataIds(
1966 ["tract"],
1967 skymap=skymap_name,
1968 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
1969 )
1970 )
1971 )
1972 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
1973 self.assertNotIn(
1974 patch_id,
1975 {
1976 data_id["patch"]
1977 for data_id in registry.queryDataIds(
1978 ["patch"],
1979 skymap=skymap_name,
1980 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
1981 )
1982 },
1983 )
1984 # Update back to the large regions and query one more time.
1985 updated = registry.syncDimensionData(
1986 "tract",
1987 {"skymap": skymap_name, "id": 0, "region": parent_region_large},
1988 update=True,
1989 )
1990 self.assertEqual(updated, {"region": parent_region_small})
1991 registry.insertDimensionData(
1992 "patch",
1993 *[
1994 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1995 for n, c in enumerate(child_regions_large)
1996 ],
1997 replace=True,
1998 )
1999 self.assertEqual(
2000 {0},
2001 {
2002 data_id["tract"]
2003 for data_id in registry.queryDataIds(
2004 ["tract"],
2005 skymap=skymap_name,
2006 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
2007 )
2008 },
2009 )
2010 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
2011 self.assertIn(
2012 patch_id,
2013 {
2014 data_id["patch"]
2015 for data_id in registry.queryDataIds(
2016 ["patch"],
2017 skymap=skymap_name,
2018 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
2019 )
2020 },
2021 )
2023 def testCalibrationCollections(self):
2024 """Test operations on `~CollectionType.CALIBRATION` collections,
2025 including `Registry.certify`, `Registry.decertify`, and
2026 `Registry.findDataset`.
2027 """
2028 # Setup - make a Registry, fill it with some datasets in
2029 # non-calibration collections.
2030 registry = self.makeRegistry()
2031 self.loadData(registry, "base.yaml")
2032 self.loadData(registry, "datasets.yaml")
2033 # Set up some timestamps.
2034 t1 = astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai")
2035 t2 = astropy.time.Time("2020-01-01T02:00:00", format="isot", scale="tai")
2036 t3 = astropy.time.Time("2020-01-01T03:00:00", format="isot", scale="tai")
2037 t4 = astropy.time.Time("2020-01-01T04:00:00", format="isot", scale="tai")
2038 t5 = astropy.time.Time("2020-01-01T05:00:00", format="isot", scale="tai")
2039 allTimespans = [
2040 Timespan(a, b) for a, b in itertools.combinations([None, t1, t2, t3, t4, t5, None], r=2)
2041 ]
2042 # Get references to some datasets.
2043 bias2a = registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g")
2044 bias3a = registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g")
2045 bias2b = registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r")
2046 bias3b = registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r")
2047 # Register the main calibration collection we'll be working with.
2048 collection = "Cam1/calibs/default"
2049 registry.registerCollection(collection, type=CollectionType.CALIBRATION)
2050 # Cannot associate into a calibration collection (no timespan).
2051 with self.assertRaises(CollectionTypeError):
2052 registry.associate(collection, [bias2a])
2053 # Certify 2a dataset with [t2, t4) validity.
2054 registry.certify(collection, [bias2a], Timespan(begin=t2, end=t4))
2055 # Test that we can query for this dataset via the new collection, both
2056 # on its own and with a RUN collection, as long as we don't try to join
2057 # in temporal dimensions or use findFirst=True.
2058 self.assertEqual(
2059 set(registry.queryDatasets("bias", findFirst=False, collections=collection)),
2060 {bias2a},
2061 )
2062 self.assertEqual(
2063 set(registry.queryDatasets("bias", findFirst=False, collections=[collection, "imported_r"])),
2064 {
2065 bias2a,
2066 bias2b,
2067 bias3b,
2068 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
2069 },
2070 )
2071 self.assertEqual(
2072 set(registry.queryDataIds("detector", datasets="bias", collections=collection)),
2073 {registry.expandDataId(instrument="Cam1", detector=2)},
2074 )
2075 self.assertEqual(
2076 set(registry.queryDataIds("detector", datasets="bias", collections=[collection, "imported_r"])),
2077 {
2078 registry.expandDataId(instrument="Cam1", detector=2),
2079 registry.expandDataId(instrument="Cam1", detector=3),
2080 registry.expandDataId(instrument="Cam1", detector=4),
2081 },
2082 )
2084 # We should not be able to certify 2b with anything overlapping that
2085 # window.
2086 with self.assertRaises(ConflictingDefinitionError):
2087 registry.certify(collection, [bias2b], Timespan(begin=None, end=t3))
2088 with self.assertRaises(ConflictingDefinitionError):
2089 registry.certify(collection, [bias2b], Timespan(begin=None, end=t5))
2090 with self.assertRaises(ConflictingDefinitionError):
2091 registry.certify(collection, [bias2b], Timespan(begin=t1, end=t3))
2092 with self.assertRaises(ConflictingDefinitionError):
2093 registry.certify(collection, [bias2b], Timespan(begin=t1, end=t5))
2094 with self.assertRaises(ConflictingDefinitionError):
2095 registry.certify(collection, [bias2b], Timespan(begin=t1, end=None))
2096 with self.assertRaises(ConflictingDefinitionError):
2097 registry.certify(collection, [bias2b], Timespan(begin=t2, end=t3))
2098 with self.assertRaises(ConflictingDefinitionError):
2099 registry.certify(collection, [bias2b], Timespan(begin=t2, end=t5))
2100 with self.assertRaises(ConflictingDefinitionError):
2101 registry.certify(collection, [bias2b], Timespan(begin=t2, end=None))
2102 # We should be able to certify 3a with a range overlapping that window,
2103 # because it's for a different detector.
2104 # We'll certify 3a over [t1, t3).
2105 registry.certify(collection, [bias3a], Timespan(begin=t1, end=t3))
2106 # Now we'll certify 2b and 3b together over [t4, ∞).
2107 registry.certify(collection, [bias2b, bias3b], Timespan(begin=t4, end=None))
2109 # Fetch all associations and check that they are what we expect.
2110 self.assertCountEqual(
2111 list(
2112 registry.queryDatasetAssociations(
2113 "bias",
2114 collections=[collection, "imported_g", "imported_r"],
2115 )
2116 ),
2117 [
2118 DatasetAssociation(
2119 ref=registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
2120 collection="imported_g",
2121 timespan=None,
2122 ),
2123 DatasetAssociation(
2124 ref=registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
2125 collection="imported_r",
2126 timespan=None,
2127 ),
2128 DatasetAssociation(ref=bias2a, collection="imported_g", timespan=None),
2129 DatasetAssociation(ref=bias3a, collection="imported_g", timespan=None),
2130 DatasetAssociation(ref=bias2b, collection="imported_r", timespan=None),
2131 DatasetAssociation(ref=bias3b, collection="imported_r", timespan=None),
2132 DatasetAssociation(ref=bias2a, collection=collection, timespan=Timespan(begin=t2, end=t4)),
2133 DatasetAssociation(ref=bias3a, collection=collection, timespan=Timespan(begin=t1, end=t3)),
2134 DatasetAssociation(ref=bias2b, collection=collection, timespan=Timespan(begin=t4, end=None)),
2135 DatasetAssociation(ref=bias3b, collection=collection, timespan=Timespan(begin=t4, end=None)),
2136 ],
2137 )
2139 class Ambiguous:
2140 """Tag class to denote lookups that should be ambiguous."""
2142 pass
2144 def assertLookup(
2145 detector: int, timespan: Timespan, expected: DatasetRef | type[Ambiguous] | None
2146 ) -> None:
2147 """Local function that asserts that a bias lookup returns the given
2148 expected result.
2149 """
2150 if expected is Ambiguous:
2151 with self.assertRaises((DatasetTypeError, LookupError)):
2152 registry.findDataset(
2153 "bias",
2154 collections=collection,
2155 instrument="Cam1",
2156 detector=detector,
2157 timespan=timespan,
2158 )
2159 else:
2160 self.assertEqual(
2161 expected,
2162 registry.findDataset(
2163 "bias",
2164 collections=collection,
2165 instrument="Cam1",
2166 detector=detector,
2167 timespan=timespan,
2168 ),
2169 )
2171 # Systematically test lookups against expected results.
2172 assertLookup(detector=2, timespan=Timespan(None, t1), expected=None)
2173 assertLookup(detector=2, timespan=Timespan(None, t2), expected=None)
2174 assertLookup(detector=2, timespan=Timespan(None, t3), expected=bias2a)
2175 assertLookup(detector=2, timespan=Timespan(None, t4), expected=bias2a)
2176 assertLookup(detector=2, timespan=Timespan(None, t5), expected=Ambiguous)
2177 assertLookup(detector=2, timespan=Timespan(None, None), expected=Ambiguous)
2178 assertLookup(detector=2, timespan=Timespan(t1, t2), expected=None)
2179 assertLookup(detector=2, timespan=Timespan(t1, t3), expected=bias2a)
2180 assertLookup(detector=2, timespan=Timespan(t1, t4), expected=bias2a)
2181 assertLookup(detector=2, timespan=Timespan(t1, t5), expected=Ambiguous)
2182 assertLookup(detector=2, timespan=Timespan(t1, None), expected=Ambiguous)
2183 assertLookup(detector=2, timespan=Timespan(t2, t3), expected=bias2a)
2184 assertLookup(detector=2, timespan=Timespan(t2, t4), expected=bias2a)
2185 assertLookup(detector=2, timespan=Timespan(t2, t5), expected=Ambiguous)
2186 assertLookup(detector=2, timespan=Timespan(t2, None), expected=Ambiguous)
2187 assertLookup(detector=2, timespan=Timespan(t3, t4), expected=bias2a)
2188 assertLookup(detector=2, timespan=Timespan(t3, t5), expected=Ambiguous)
2189 assertLookup(detector=2, timespan=Timespan(t3, None), expected=Ambiguous)
2190 assertLookup(detector=2, timespan=Timespan(t4, t5), expected=bias2b)
2191 assertLookup(detector=2, timespan=Timespan(t4, None), expected=bias2b)
2192 assertLookup(detector=2, timespan=Timespan(t5, None), expected=bias2b)
2193 assertLookup(detector=3, timespan=Timespan(None, t1), expected=None)
2194 assertLookup(detector=3, timespan=Timespan(None, t2), expected=bias3a)
2195 assertLookup(detector=3, timespan=Timespan(None, t3), expected=bias3a)
2196 assertLookup(detector=3, timespan=Timespan(None, t4), expected=bias3a)
2197 assertLookup(detector=3, timespan=Timespan(None, t5), expected=Ambiguous)
2198 assertLookup(detector=3, timespan=Timespan(None, None), expected=Ambiguous)
2199 assertLookup(detector=3, timespan=Timespan(t1, t2), expected=bias3a)
2200 assertLookup(detector=3, timespan=Timespan(t1, t3), expected=bias3a)
2201 assertLookup(detector=3, timespan=Timespan(t1, t4), expected=bias3a)
2202 assertLookup(detector=3, timespan=Timespan(t1, t5), expected=Ambiguous)
2203 assertLookup(detector=3, timespan=Timespan(t1, None), expected=Ambiguous)
2204 assertLookup(detector=3, timespan=Timespan(t2, t3), expected=bias3a)
2205 assertLookup(detector=3, timespan=Timespan(t2, t4), expected=bias3a)
2206 assertLookup(detector=3, timespan=Timespan(t2, t5), expected=Ambiguous)
2207 assertLookup(detector=3, timespan=Timespan(t2, None), expected=Ambiguous)
2208 assertLookup(detector=3, timespan=Timespan(t3, t4), expected=None)
2209 assertLookup(detector=3, timespan=Timespan(t3, t5), expected=bias3b)
2210 assertLookup(detector=3, timespan=Timespan(t3, None), expected=bias3b)
2211 assertLookup(detector=3, timespan=Timespan(t4, t5), expected=bias3b)
2212 assertLookup(detector=3, timespan=Timespan(t4, None), expected=bias3b)
2213 assertLookup(detector=3, timespan=Timespan(t5, None), expected=bias3b)
2215 # Decertify [t3, t5) for all data IDs, and run the test lookups again.
2216 # This should truncate bias2a to [t2, t3), leave bias3a unchanged at
2217 # [t1, t3), and truncate bias2b and bias3b to [t5, ∞).
2218 registry.decertify(collection=collection, datasetType="bias", timespan=Timespan(t3, t5))
2219 assertLookup(detector=2, timespan=Timespan(None, t1), expected=None)
2220 assertLookup(detector=2, timespan=Timespan(None, t2), expected=None)
2221 assertLookup(detector=2, timespan=Timespan(None, t3), expected=bias2a)
2222 assertLookup(detector=2, timespan=Timespan(None, t4), expected=bias2a)
2223 assertLookup(detector=2, timespan=Timespan(None, t5), expected=bias2a)
2224 assertLookup(detector=2, timespan=Timespan(None, None), expected=Ambiguous)
2225 assertLookup(detector=2, timespan=Timespan(t1, t2), expected=None)
2226 assertLookup(detector=2, timespan=Timespan(t1, t3), expected=bias2a)
2227 assertLookup(detector=2, timespan=Timespan(t1, t4), expected=bias2a)
2228 assertLookup(detector=2, timespan=Timespan(t1, t5), expected=bias2a)
2229 assertLookup(detector=2, timespan=Timespan(t1, None), expected=Ambiguous)
2230 assertLookup(detector=2, timespan=Timespan(t2, t3), expected=bias2a)
2231 assertLookup(detector=2, timespan=Timespan(t2, t4), expected=bias2a)
2232 assertLookup(detector=2, timespan=Timespan(t2, t5), expected=bias2a)
2233 assertLookup(detector=2, timespan=Timespan(t2, None), expected=Ambiguous)
2234 assertLookup(detector=2, timespan=Timespan(t3, t4), expected=None)
2235 assertLookup(detector=2, timespan=Timespan(t3, t5), expected=None)
2236 assertLookup(detector=2, timespan=Timespan(t3, None), expected=bias2b)
2237 assertLookup(detector=2, timespan=Timespan(t4, t5), expected=None)
2238 assertLookup(detector=2, timespan=Timespan(t4, None), expected=bias2b)
2239 assertLookup(detector=2, timespan=Timespan(t5, None), expected=bias2b)
2240 assertLookup(detector=3, timespan=Timespan(None, t1), expected=None)
2241 assertLookup(detector=3, timespan=Timespan(None, t2), expected=bias3a)
2242 assertLookup(detector=3, timespan=Timespan(None, t3), expected=bias3a)
2243 assertLookup(detector=3, timespan=Timespan(None, t4), expected=bias3a)
2244 assertLookup(detector=3, timespan=Timespan(None, t5), expected=bias3a)
2245 assertLookup(detector=3, timespan=Timespan(None, None), expected=Ambiguous)
2246 assertLookup(detector=3, timespan=Timespan(t1, t2), expected=bias3a)
2247 assertLookup(detector=3, timespan=Timespan(t1, t3), expected=bias3a)
2248 assertLookup(detector=3, timespan=Timespan(t1, t4), expected=bias3a)
2249 assertLookup(detector=3, timespan=Timespan(t1, t5), expected=bias3a)
2250 assertLookup(detector=3, timespan=Timespan(t1, None), expected=Ambiguous)
2251 assertLookup(detector=3, timespan=Timespan(t2, t3), expected=bias3a)
2252 assertLookup(detector=3, timespan=Timespan(t2, t4), expected=bias3a)
2253 assertLookup(detector=3, timespan=Timespan(t2, t5), expected=bias3a)
2254 assertLookup(detector=3, timespan=Timespan(t2, None), expected=Ambiguous)
2255 assertLookup(detector=3, timespan=Timespan(t3, t4), expected=None)
2256 assertLookup(detector=3, timespan=Timespan(t3, t5), expected=None)
2257 assertLookup(detector=3, timespan=Timespan(t3, None), expected=bias3b)
2258 assertLookup(detector=3, timespan=Timespan(t4, t5), expected=None)
2259 assertLookup(detector=3, timespan=Timespan(t4, None), expected=bias3b)
2260 assertLookup(detector=3, timespan=Timespan(t5, None), expected=bias3b)
2262 # Decertify everything, this time with explicit data IDs, then check
2263 # that no lookups succeed.
2264 registry.decertify(
2265 collection,
2266 "bias",
2267 Timespan(None, None),
2268 dataIds=[
2269 dict(instrument="Cam1", detector=2),
2270 dict(instrument="Cam1", detector=3),
2271 ],
2272 )
2273 for detector in (2, 3):
2274 for timespan in allTimespans:
2275 assertLookup(detector=detector, timespan=timespan, expected=None)
2276 # Certify bias2a and bias3a over (-∞, ∞), check that all lookups return
2277 # those.
2278 registry.certify(
2279 collection,
2280 [bias2a, bias3a],
2281 Timespan(None, None),
2282 )
2283 for timespan in allTimespans:
2284 assertLookup(detector=2, timespan=timespan, expected=bias2a)
2285 assertLookup(detector=3, timespan=timespan, expected=bias3a)
2286 # Decertify just bias2a over [t2, t4).
2287 # This should split a single certification row into two (and leave the
2288 # other existing row, for bias3a, alone).
2289 registry.decertify(
2290 collection, "bias", Timespan(t2, t4), dataIds=[dict(instrument="Cam1", detector=2)]
2291 )
2292 for timespan in allTimespans:
2293 assertLookup(detector=3, timespan=timespan, expected=bias3a)
2294 overlapsBefore = timespan.overlaps(Timespan(None, t2))
2295 overlapsAfter = timespan.overlaps(Timespan(t4, None))
2296 if overlapsBefore and overlapsAfter:
2297 expected = Ambiguous
2298 elif overlapsBefore or overlapsAfter:
2299 expected = bias2a
2300 else:
2301 expected = None
2302 assertLookup(detector=2, timespan=timespan, expected=expected)
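# A minimal sketch of the certify/decertify/findDataset cycle tested
# exhaustively above (not part of the test suite); the collection name
# and timestamps below are hypothetical, and astropy.time, Timespan,
# and CollectionType come from the module-level imports.
t_a = astropy.time.Time("2021-06-01T00:00:00", format="isot", scale="tai")
t_b = astropy.time.Time("2021-07-01T00:00:00", format="isot", scale="tai")
registry.registerCollection("Cam1/calibs/sketch", type=CollectionType.CALIBRATION)
bias_ref = registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g")
registry.certify("Cam1/calibs/sketch", [bias_ref], Timespan(t_a, t_b))
# A lookup succeeds when its timespan overlaps exactly one validity range.
found = registry.findDataset(
    "bias", instrument="Cam1", detector=2, timespan=Timespan(t_a, t_b), collections="Cam1/calibs/sketch"
)
assert found == bias_ref
registry.decertify("Cam1/calibs/sketch", "bias", Timespan(None, None))  # drop all validity ranges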
2304 def testSkipCalibs(self):
2305 """Test how queries handle skipping of calibration collections."""
2306 registry = self.makeRegistry()
2307 self.loadData(registry, "base.yaml")
2308 self.loadData(registry, "datasets.yaml")
2310 coll_calib = "Cam1/calibs/default"
2311 registry.registerCollection(coll_calib, type=CollectionType.CALIBRATION)
2313 # Add all biases to the calibration collection.
2314 # Without this, the logic that prunes dataset subqueries based on
2315 # datasetType-collection summary information will fire before the logic
2316 # we want to test below. This is a good thing (it avoids the dreaded
2317 # NotImplementedError a bit more often) everywhere but here.
2318 registry.certify(coll_calib, registry.queryDatasets("bias", collections=...), Timespan(None, None))
2320 coll_list = [coll_calib, "imported_g", "imported_r"]
2321 chain = "Cam1/chain"
2322 registry.registerCollection(chain, type=CollectionType.CHAINED)
2323 registry.setCollectionChain(chain, coll_list)
2325 # explicit list will raise if findFirst=True or there are temporal
2326 # dimensions
2327 with self.assertRaises(NotImplementedError):
2328 registry.queryDatasets("bias", collections=coll_list, findFirst=True)
2329 with self.assertRaises(NotImplementedError):
2330 registry.queryDataIds(
2331 ["instrument", "detector", "exposure"], datasets="bias", collections=coll_list
2332 ).count()
2334 # chain will skip
2335 datasets = list(registry.queryDatasets("bias", collections=chain))
2336 self.assertGreater(len(datasets), 0)
2338 dataIds = list(registry.queryDataIds(["instrument", "detector"], datasets="bias", collections=chain))
2339 self.assertGreater(len(dataIds), 0)
2341 # glob will skip too
2342 datasets = list(registry.queryDatasets("bias", collections="*d*"))
2343 self.assertGreater(len(datasets), 0)
2345 # regular expression will skip too
2346 pattern = re.compile(".*")
2347 datasets = list(registry.queryDatasets("bias", collections=pattern))
2348 self.assertGreater(len(datasets), 0)
2350 # ellipsis should work as usual
2351 datasets = list(registry.queryDatasets("bias", collections=...))
2352 self.assertGreater(len(datasets), 0)
2354 # a few tests with findFirst
2355 datasets = list(registry.queryDatasets("bias", collections=chain, findFirst=True))
2356 self.assertGreater(len(datasets), 0)
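# A minimal sketch of the skipping behavior tested above (not part of
# the test suite): naming a CALIBRATION collection explicitly in a
# findFirst query raises NotImplementedError, while a CHAINED parent
# (or a wildcard) skips the calibration member instead. `coll_calib`
# is the collection registered in this test.
registry.registerCollection("Cam1/sketch_chain", type=CollectionType.CHAINED)
registry.setCollectionChain("Cam1/sketch_chain", [coll_calib, "imported_g"])
refs = list(registry.queryDatasets("bias", collections="Cam1/sketch_chain", findFirst=True))
assert all(ref.run == "imported_g" for ref in refs)  # calibration member was skipped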
2358 def testIngestTimeQuery(self):
2359 registry = self.makeRegistry()
2360 self.loadData(registry, "base.yaml")
2361 dt0 = datetime.utcnow()
2362 self.loadData(registry, "datasets.yaml")
2363 dt1 = datetime.utcnow()
2365 datasets = list(registry.queryDatasets(..., collections=...))
2366 len0 = len(datasets)
2367 self.assertGreater(len0, 0)
2369 where = "ingest_date > T'2000-01-01'"
2370 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2371 len1 = len(datasets)
2372 self.assertEqual(len0, len1)
2374 # no one will ever use this piece of software in 30 years
2375 where = "ingest_date > T'2050-01-01'"
2376 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2377 len2 = len(datasets)
2378 self.assertEqual(len2, 0)
2380 # Check more exact timing to make sure there is no 37-second offset
2381 # (after fixing DM-30124). SQLite time precision is 1 second, so make
2382 # sure that we don't test with higher precision.
2383 tests = [
2384 # format: (timestamp, operator, expected_len)
2385 (dt0 - timedelta(seconds=1), ">", len0),
2386 (dt0 - timedelta(seconds=1), "<", 0),
2387 (dt1 + timedelta(seconds=1), "<", len0),
2388 (dt1 + timedelta(seconds=1), ">", 0),
2389 ]
2390 for dt, op, expect_len in tests:
2391 dt_str = dt.isoformat(sep=" ")
2393 where = f"ingest_date {op} T'{dt_str}'"
2394 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2395 self.assertEqual(len(datasets), expect_len)
2397 # same with bind using datetime or astropy Time
2398 where = f"ingest_date {op} ingest_time"
2399 datasets = list(
2400 registry.queryDatasets(..., collections=..., where=where, bind={"ingest_time": dt})
2401 )
2402 self.assertEqual(len(datasets), expect_len)
2404 dt_astropy = astropy.time.Time(dt, format="datetime")
2405 datasets = list(
2406 registry.queryDatasets(..., collections=..., where=where, bind={"ingest_time": dt_astropy})
2407 )
2408 self.assertEqual(len(datasets), expect_len)
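# A minimal sketch of the ingest_date comparisons tested above (not
# part of the test suite): a time literal can be written inline with
# T'...' or passed separately via bind. "ingest_cutoff" is a
# hypothetical bind key; `datetime` comes from the module imports.
cutoff = datetime(2020, 1, 1)
via_bind = list(
    registry.queryDatasets(
        ..., collections=..., where="ingest_date > ingest_cutoff", bind={"ingest_cutoff": cutoff}
    )
)
via_literal = list(registry.queryDatasets(..., collections=..., where="ingest_date > T'2020-01-01'"))
assert len(via_bind) == len(via_literal)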
2410 def testTimespanQueries(self):
2411 """Test query expressions involving timespans."""
2412 registry = self.makeRegistry()
2413 self.loadData(registry, "hsc-rc2-subset.yaml")
2414 # All visits in the database; mapping from ID to timespan.
2415 visits = {record.id: record.timespan for record in registry.queryDimensionRecords("visit")}
2416 # Just those IDs, sorted (which is also temporal sorting, because HSC
2417 # visit IDs are monotonically increasing).
2418 ids = sorted(visits.keys())
2419 self.assertGreater(len(ids), 20)
2420 # Pick some quasi-random indexes into `ids` to play with.
2421 i1 = int(len(ids) * 0.1)
2422 i2 = int(len(ids) * 0.3)
2423 i3 = int(len(ids) * 0.6)
2424 i4 = int(len(ids) * 0.8)
2425 # Extract some times from those: just before the beginning of i1 (which
2426 # should be after the end of the previous visit), exactly the
2427 # beginning of i2, just after the beginning of i3 (and before its end),
2428 # and the exact end of i4.
2429 t1 = visits[ids[i1]].begin - astropy.time.TimeDelta(1.0, format="sec")
2430 self.assertGreater(t1, visits[ids[i1 - 1]].end)
2431 t2 = visits[ids[i2]].begin
2432 t3 = visits[ids[i3]].begin + astropy.time.TimeDelta(1.0, format="sec")
2433 self.assertLess(t3, visits[ids[i3]].end)
2434 t4 = visits[ids[i4]].end
2435 # Make sure those are actually in order.
2436 self.assertEqual([t1, t2, t3, t4], sorted([t4, t3, t2, t1]))
2438 bind = {
2439 "t1": t1,
2440 "t2": t2,
2441 "t3": t3,
2442 "t4": t4,
2443 "ts23": Timespan(t2, t3),
2444 }
2446 def query(where):
2447 """Return results as a sorted, deduplicated list of visit IDs."""
2448 return sorted(
2449 {
2450 dataId["visit"]
2451 for dataId in registry.queryDataIds("visit", instrument="HSC", bind=bind, where=where)
2452 }
2453 )
2455 # Try a bunch of timespan queries, mixing up the bounds themselves,
2456 # where they appear in the expression, and how we get the timespan into
2457 # the expression.
2459 # t1 is before the start of i1, so this should not include i1.
2460 self.assertEqual(ids[:i1], query("visit.timespan OVERLAPS (null, t1)"))
2461 # t2 is exactly at the start of i2, but ends are exclusive, so these
2462 # should not include i2.
2463 self.assertEqual(ids[i1:i2], query("(t1, t2) OVERLAPS visit.timespan"))
2464 self.assertEqual(ids[:i2], query("visit.timespan < (t2, t4)"))
2465 # t3 is in the middle of i3, so this should include i3.
2466 self.assertEqual(ids[i2 : i3 + 1], query("visit.timespan OVERLAPS ts23"))
2467 # This one should not include i3, by the same reasoning.
2468 self.assertEqual(ids[i3 + 1 :], query("visit.timespan > (t1, t3)"))
2469 # t4 is exactly at the end of i4, so this should include i4.
2470 self.assertEqual(ids[i3 : i4 + 1], query(f"visit.timespan OVERLAPS (T'{t3.tai.isot}', t4)"))
2471 # i4's upper bound of t4 is exclusive, so this should not include i4.
2472 self.assertEqual(ids[i4 + 1 :], query("visit.timespan OVERLAPS (t4, NULL)"))
2474 # Now some timespan vs. time scalar queries.
2475 self.assertEqual(ids[:i2], query("visit.timespan < t2"))
2476 self.assertEqual(ids[:i2], query("t2 > visit.timespan"))
2477 self.assertEqual(ids[i3 + 1 :], query("visit.timespan > t3"))
2478 self.assertEqual(ids[i3 + 1 :], query("t3 < visit.timespan"))
2479 self.assertEqual(ids[i3 : i3 + 1], query("visit.timespan OVERLAPS t3"))
2480 self.assertEqual(ids[i3 : i3 + 1], query(f"T'{t3.tai.isot}' OVERLAPS visit.timespan"))
2482 # Empty timespans should not overlap anything.
2483 self.assertEqual([], query("visit.timespan OVERLAPS (t3, t2)"))
2485 def testCollectionSummaries(self):
2486 """Test recording and retrieval of collection summaries."""
2487 self.maxDiff = None
2488 registry = self.makeRegistry()
2489 # Importing datasets from yaml should go through the code path where
2490 # we update collection summaries as we insert datasets.
2491 self.loadData(registry, "base.yaml")
2492 self.loadData(registry, "datasets.yaml")
2493 flat = registry.getDatasetType("flat")
2494 expected1 = CollectionSummary()
2495 expected1.dataset_types.add(registry.getDatasetType("bias"))
2496 expected1.add_data_ids(
2497 flat, [DataCoordinate.standardize(instrument="Cam1", universe=registry.dimensions)]
2498 )
2499 self.assertEqual(registry.getCollectionSummary("imported_g"), expected1)
2500 self.assertEqual(registry.getCollectionSummary("imported_r"), expected1)
2501 # Create a chained collection with both of the imported runs; the
2502 # summary should be the same, because it's a union with itself.
2503 chain = "chain"
2504 registry.registerCollection(chain, CollectionType.CHAINED)
2505 registry.setCollectionChain(chain, ["imported_r", "imported_g"])
2506 self.assertEqual(registry.getCollectionSummary(chain), expected1)
2507 # Associate flats only into a tagged collection and a calibration
2508 # collection to check summaries of those.
2509 tag = "tag"
2510 registry.registerCollection(tag, CollectionType.TAGGED)
2511 registry.associate(tag, registry.queryDatasets(flat, collections="imported_g"))
2512 calibs = "calibs"
2513 registry.registerCollection(calibs, CollectionType.CALIBRATION)
2514 registry.certify(
2515 calibs, registry.queryDatasets(flat, collections="imported_g"), timespan=Timespan(None, None)
2516 )
2517 expected2 = expected1.copy()
2518 expected2.dataset_types.discard("bias")
2519 self.assertEqual(registry.getCollectionSummary(tag), expected2)
2520 self.assertEqual(registry.getCollectionSummary(calibs), expected2)
2521 # Explicitly calling Registry.refresh() should load those same
2522 # summaries, via a totally different code path.
2523 registry.refresh()
2524 self.assertEqual(registry.getCollectionSummary("imported_g"), expected1)
2525 self.assertEqual(registry.getCollectionSummary("imported_r"), expected1)
2526 self.assertEqual(registry.getCollectionSummary(tag), expected2)
2527 self.assertEqual(registry.getCollectionSummary(calibs), expected2)
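# A minimal sketch of what a collection summary records (not part of
# the test suite): the dataset types (and governor-dimension values) a
# collection may contain, which lets queries rule out collections
# without scanning their contents.
summary = registry.getCollectionSummary("imported_g")
assert "bias" in summary.dataset_types.names  # dataset_types is a NamedValueSet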
2529 def testBindInQueryDatasets(self):
2530 """Test that the bind parameter is correctly forwarded in
2531 queryDatasets recursion.
2532 """
2533 registry = self.makeRegistry()
2534 # Load some datasets so there is something to query against.
2536 self.loadData(registry, "base.yaml")
2537 self.loadData(registry, "datasets.yaml")
2538 self.assertEqual(
2539 set(registry.queryDatasets("flat", band="r", collections=...)),
2540 set(registry.queryDatasets("flat", where="band=my_band", bind={"my_band": "r"}, collections=...)),
2541 )
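# A minimal sketch of the bind mechanism verified above (not part of
# the test suite): bind keeps user-supplied values out of the
# expression string itself, avoiding quoting and injection concerns.
refs = set(
    registry.queryDatasets(
        "flat",
        where="band = my_band AND instrument = my_instrument",
        bind={"my_band": "r", "my_instrument": "Cam1"},
        collections=...,
    )
)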
2543 def testQueryIntRangeExpressions(self):
2544 """Test integer range expressions in ``where`` arguments.
2546 Note that our expressions use inclusive stop values, unlike Python's.
2547 """
2548 registry = self.makeRegistry()
2549 self.loadData(registry, "base.yaml")
2550 self.assertEqual(
2551 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (1..2)")),
2552 {registry.expandDataId(instrument="Cam1", detector=n) for n in [1, 2]},
2553 )
2554 self.assertEqual(
2555 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (1..4:2)")),
2556 {registry.expandDataId(instrument="Cam1", detector=n) for n in [1, 3]},
2557 )
2558 self.assertEqual(
2559 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (2..4:2)")),
2560 {registry.expandDataId(instrument="Cam1", detector=n) for n in [2, 4]},
2561 )
2563 def testQueryResultSummaries(self):
2564 """Test summary methods like `count`, `any`, and `explain_no_results`
2565 on `DataCoordinateQueryResults` and `DatasetQueryResults`.
2566 """
2567 registry = self.makeRegistry()
2568 self.loadData(registry, "base.yaml")
2569 self.loadData(registry, "datasets.yaml")
2570 self.loadData(registry, "spatial.yaml")
2571 # Default test dataset has two collections, each with both flats and
2572 # biases. Add a new collection with only biases.
2573 registry.registerCollection("biases", CollectionType.TAGGED)
2574 registry.associate("biases", registry.queryDatasets("bias", collections=["imported_g"]))
2575 # First query yields two results, and involves no postprocessing.
2576 query1 = registry.queryDataIds(["physical_filter"], band="r")
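# Flag semantics, for reference: execute=False answers from query analysis
# alone without hitting the database, and exact=False lets the answer
# ignore post-query filtering (an upper bound, for count). This query
# needs no postprocessing, so all variants should agree.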
2577 self.assertTrue(query1.any(execute=False, exact=False))
2578 self.assertTrue(query1.any(execute=True, exact=False))
2579 self.assertTrue(query1.any(execute=True, exact=True))
2580 self.assertEqual(query1.count(exact=False), 2)
2581 self.assertEqual(query1.count(exact=True), 2)
2582 self.assertFalse(list(query1.explain_no_results()))
2583 # Second query should yield no results, which the summary methods
2584 # should report once the query is executed.
2585 query2 = registry.queryDataIds(["physical_filter"], band="h")
2586 # There's no execute=False, exact=False test here because the behavior
2587 # is not something we want to guarantee in this case (and exact=False
2588 # says either answer is legal).
2589 self.assertFalse(query2.any(execute=True, exact=False))
2590 self.assertFalse(query2.any(execute=True, exact=True))
2591 self.assertEqual(query2.count(exact=False), 0)
2592 self.assertEqual(query2.count(exact=True), 0)
2593 self.assertTrue(list(query2.explain_no_results()))
2594 # These queries yield no results due to various problems that can be
2595 # spotted prior to execution, yielding helpful diagnostics.
2596 base_query = registry.queryDataIds(["detector", "physical_filter"])
2597 queries_and_snippets = [
2598 (
2599 # Dataset type name doesn't match any existing dataset types.
2600 registry.queryDatasets("nonexistent", collections=...),
2601 ["nonexistent"],
2602 ),
2603 (
2604 # Dataset type object isn't registered.
2605 registry.queryDatasets(
2606 DatasetType(
2607 "nonexistent",
2608 dimensions=["instrument"],
2609 universe=registry.dimensions,
2610 storageClass="Image",
2611 ),
2612 collections=...,
2613 ),
2614 ["nonexistent"],
2615 ),
2616 (
2617 # No datasets of this type in this collection.
2618 registry.queryDatasets("flat", collections=["biases"]),
2619 ["flat", "biases"],
2620 ),
2621 (
2622 # No datasets of this type in this collection.
2623 base_query.findDatasets("flat", collections=["biases"]),
2624 ["flat", "biases"],
2625 ),
2626 (
2627 # No collections matching at all.
2628 registry.queryDatasets("flat", collections=re.compile("potato.+")),
2629 ["potato"],
2630 ),
2631 ]
2632 # The behavior of these additional queries is slated to change in the
2633 # future, so we also check for deprecation warnings.
2634 with self.assertWarns(FutureWarning):
2635 queries_and_snippets.append(
2636 (
2637 # Dataset type name doesn't match any existing dataset
2638 # types.
2639 registry.queryDataIds(["detector"], datasets=["nonexistent"], collections=...),
2640 ["nonexistent"],
2641 )
2642 )
2643 with self.assertWarns(FutureWarning):
2644 queries_and_snippets.append(
2645 (
2646 # Dataset type name doesn't match any existing dataset
2647 # types.
2648 registry.queryDimensionRecords("detector", datasets=["nonexistent"], collections=...),
2649 ["nonexistent"],
2650 )
2651 )
2652 for query, snippets in queries_and_snippets:
2653 self.assertFalse(query.any(execute=False, exact=False))
2654 self.assertFalse(query.any(execute=True, exact=False))
2655 self.assertFalse(query.any(execute=True, exact=True))
2656 self.assertEqual(query.count(exact=False), 0)
2657 self.assertEqual(query.count(exact=True), 0)
2658 messages = list(query.explain_no_results())
2659 self.assertTrue(messages)
2660 # Want all expected snippets to appear in at least one message.
2661 self.assertTrue(
2662 any(
2663 all(snippet in message for snippet in snippets) for message in query.explain_no_results()
2664 ),
2665 messages,
2666 )
2668 # This query does yield results, but should also emit a warning because
2669 # passing dataset type patterns to queryDataIds is deprecated; just look
2670 # for the warning.
2671 with self.assertWarns(FutureWarning):
2672 registry.queryDataIds(["detector"], datasets=re.compile("^nonexistent$"), collections=...)
2674 # These queries yield no results due to problems that can be identified
2675 # by cheap follow-up queries, yielding helpful diagnostics.
2676 for query, snippets in [
2677 (
2678 # No records for one of the involved dimensions.
2679 registry.queryDataIds(["subfilter"]),
2680 ["no rows", "subfilter"],
2681 ),
2682 (
2683 # No records for one of the involved dimensions.
2684 registry.queryDimensionRecords("subfilter"),
2685 ["no rows", "subfilter"],
2686 ),
2687 ]:
2688 self.assertFalse(query.any(execute=True, exact=False))
2689 self.assertFalse(query.any(execute=True, exact=True))
2690 self.assertEqual(query.count(exact=True), 0)
2691 messages = list(query.explain_no_results())
2692 self.assertTrue(messages)
2693 # Want all expected snippets to appear in at least one message.
2694 self.assertTrue(
2695 any(
2696 all(snippet in message for snippet in snippets) for message in query.explain_no_results()
2697 ),
2698 messages,
2699 )
2701 # This query yields four overlaps in the database, but one is filtered
2702 # out in postprocessing. The count queries aren't accurate because
2703 # they don't account for duplication that happens due to an internal
2704 # join against commonSkyPix.
2705 query3 = registry.queryDataIds(["visit", "tract"], instrument="Cam1", skymap="SkyMap1")
2706 self.assertEqual(
2707 {
2708 DataCoordinate.standardize(
2709 instrument="Cam1",
2710 skymap="SkyMap1",
2711 visit=v,
2712 tract=t,
2713 universe=registry.dimensions,
2714 )
2715 for v, t in [(1, 0), (2, 0), (2, 1)]
2716 },
2717 set(query3),
2718 )
2719 self.assertTrue(query3.any(execute=False, exact=False))
2720 self.assertTrue(query3.any(execute=True, exact=False))
2721 self.assertTrue(query3.any(execute=True, exact=True))
2722 self.assertGreaterEqual(query3.count(exact=False), 4)
2723 self.assertGreaterEqual(query3.count(exact=True, discard=True), 3)
2724 self.assertFalse(list(query3.explain_no_results()))
2725 # This query yields overlaps in the database, but all are filtered
2726 # out in postprocessing. The count queries again aren't very useful.
2727 # We have to use `where=` here to avoid an optimization that
2728 # (currently) skips the spatial postprocess-filtering because it
2729 # recognizes that no spatial join is necessary. That's not ideal, but
2730 # fixing it is out of scope for this ticket.
2731 query4 = registry.queryDataIds(
2732 ["visit", "tract"],
2733 instrument="Cam1",
2734 skymap="SkyMap1",
2735 where="visit=1 AND detector=1 AND tract=0 AND patch=4",
2736 )
2737 self.assertFalse(set(query4))
2738 self.assertTrue(query4.any(execute=False, exact=False))
2739 self.assertTrue(query4.any(execute=True, exact=False))
2740 self.assertFalse(query4.any(execute=True, exact=True))
2741 self.assertGreaterEqual(query4.count(exact=False), 1)
2742 self.assertEqual(query4.count(exact=True, discard=True), 0)
2743 messages = query4.explain_no_results()
2744 self.assertTrue(messages)
2745 self.assertTrue(any("overlap" in message for message in messages))
2746 # This query should yield results from one dataset type but not the
2747 # other, which is not registered.
2748 query5 = registry.queryDatasets(["bias", "nonexistent"], collections=["biases"])
2749 self.assertTrue(set(query5))
2750 self.assertTrue(query5.any(execute=False, exact=False))
2751 self.assertTrue(query5.any(execute=True, exact=False))
2752 self.assertTrue(query5.any(execute=True, exact=True))
2753 self.assertGreaterEqual(query5.count(exact=False), 1)
2754 self.assertGreaterEqual(query5.count(exact=True), 1)
2755 self.assertFalse(list(query5.explain_no_results()))
2756 # This query applies a selection that yields no results, fully in the
2757 # database. Explaining why it fails involves traversing the relation
2758 # tree and running a LIMIT 1 query at each level that has the potential
2759 # to remove rows.
2760 query6 = registry.queryDimensionRecords(
2761 "detector", where="detector.purpose = 'no-purpose'", instrument="Cam1"
2762 )
2763 self.assertEqual(query6.count(exact=True), 0)
2764 messages = query6.explain_no_results()
2765 self.assertTrue(messages)
2766 self.assertTrue(any("no-purpose" in message for message in messages))
2768 def testQueryDataIdsExpressionError(self):
2769 """Test error checking of 'where' expressions in queryDataIds."""
2770 registry = self.makeRegistry()
2771 self.loadData(registry, "base.yaml")
2772 bind = {"time": astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai")}
2773 with self.assertRaisesRegex(LookupError, r"No dimension element with name 'foo' in 'foo\.bar'\."):
2774 registry.queryDataIds(["detector"], where="foo.bar = 12")
2775 with self.assertRaisesRegex(
2776 LookupError, "Dimension element name cannot be inferred in this context."
2777 ):
2778 registry.queryDataIds(["detector"], where="timespan.end < time", bind=bind)
2780 def testQueryDataIdsOrderBy(self):
2781 """Test order_by and limit on result returned by queryDataIds()."""
2782 registry = self.makeRegistry()
2783 self.loadData(registry, "base.yaml")
2784 self.loadData(registry, "datasets.yaml")
2785 self.loadData(registry, "spatial.yaml")
2787 def do_query(dimensions=("visit", "tract"), datasets=None, collections=None):
2788 return registry.queryDataIds(
2789 dimensions, datasets=datasets, collections=collections, instrument="Cam1", skymap="SkyMap1"
2790 )
2792 Test = namedtuple(
2793 "testQueryDataIdsOrderByTest",
2794 ("order_by", "keys", "result", "limit", "datasets", "collections"),
2795 defaults=(None, None, None),
2796 )
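# (namedtuple defaults bind to the rightmost fields, so limit, datasets,
# and collections all default to None here.)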
2798 test_data = (
2799 Test("tract,visit", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2800 Test("-tract,visit", "tract,visit", ((1, 2), (1, 2), (0, 1), (0, 1), (0, 2), (0, 2))),
2801 Test("tract,-visit", "tract,visit", ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2))),
2802 Test("-tract,-visit", "tract,visit", ((1, 2), (1, 2), (0, 2), (0, 2), (0, 1), (0, 1))),
2803 Test(
2804 "tract.id,visit.id",
2805 "tract,visit",
2806 ((0, 1), (0, 1), (0, 2)),
2807 limit=(3,),
2808 ),
2809 Test("-tract,-visit", "tract,visit", ((1, 2), (1, 2), (0, 2)), limit=(3,)),
2810 Test("tract,visit", "tract,visit", ((0, 2), (1, 2), (1, 2)), limit=(3, 3)),
2811 Test("-tract,-visit", "tract,visit", ((0, 1),), limit=(3, 5)),
2812 Test(
2813 "tract,visit.exposure_time", "tract,visit", ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2))
2814 ),
2815 Test(
2816 "-tract,-visit.exposure_time", "tract,visit", ((1, 2), (1, 2), (0, 1), (0, 1), (0, 2), (0, 2))
2817 ),
2818 Test("tract,-exposure_time", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2819 Test("tract,visit.name", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2820 Test(
2821 "tract,-timespan.begin,timespan.end",
2822 "tract,visit",
2823 ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2)),
2824 ),
2825 Test("visit.day_obs,exposure.day_obs", "visit,exposure", ()),
2826 Test("visit.timespan.begin,-exposure.timespan.begin", "visit,exposure", ()),
2827 Test(
2828 "tract,detector",
2829 "tract,detector",
2830 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2831 datasets="flat",
2832 collections="imported_r",
2833 ),
2834 Test(
2835 "tract,detector.full_name",
2836 "tract,detector",
2837 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2838 datasets="flat",
2839 collections="imported_r",
2840 ),
2841 Test(
2842 "tract,detector.raft,detector.name_in_raft",
2843 "tract,detector",
2844 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2845 datasets="flat",
2846 collections="imported_r",
2847 ),
2848 )
2850 for test in test_data:
2851 order_by = test.order_by.split(",")
2852 keys = test.keys.split(",")
2853 query = do_query(keys, test.datasets, test.collections).order_by(*order_by)
2854 if test.limit is not None:
2855 query = query.limit(*test.limit)
2856 dataIds = tuple(tuple(dataId[k] for k in keys) for dataId in query)
2857 self.assertEqual(dataIds, test.result)
2859 # Materializing a query with ORDER BY/LIMIT applied should raise.
2860 query = do_query(keys).order_by(*order_by)
2861 if test.limit is not None:
2862 query = query.limit(*test.limit)
2863 with self.assertRaises(RelationalAlgebraError):
2864 with query.materialize():
2865 pass
2867 # Error handling for malformed or unknown names in order_by().
2868 for order_by in ("", "-"):
2869 with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"):
2870 list(do_query().order_by(order_by))
2872 for order_by in ("undimension.name", "-undimension.name"):
2873 with self.assertRaisesRegex(ValueError, "Unknown dimension element 'undimension'"):
2874 list(do_query().order_by(order_by))
2876 for order_by in ("attract", "-attract"):
2877 with self.assertRaisesRegex(ValueError, "Metadata 'attract' cannot be found in any dimension"):
2878 list(do_query().order_by(order_by))
2880 with self.assertRaisesRegex(ValueError, "Metadata 'exposure_time' exists in more than one dimension"):
2881 list(do_query(("exposure", "visit")).order_by("exposure_time"))
2883 with self.assertRaisesRegex(
2884 ValueError,
2885 r"Timespan exists in more than one dimension element \(exposure, visit\); "
2886 r"qualify timespan with specific dimension name\.",
2887 ):
2888 list(do_query(("exposure", "visit")).order_by("timespan.begin"))
2890 with self.assertRaisesRegex(
2891 ValueError, "Cannot find any temporal dimension element for 'timespan.begin'"
2892 ):
2893 list(do_query("tract").order_by("timespan.begin"))
2895 with self.assertRaisesRegex(ValueError, "Cannot use 'timespan.begin' with non-temporal element"):
2896 list(do_query("tract").order_by("tract.timespan.begin"))
2898 with self.assertRaisesRegex(ValueError, "Field 'name' does not exist in 'tract'."):
2899 list(do_query("tract").order_by("tract.name"))
2901 with self.assertRaisesRegex(
2902 ValueError, r"Unknown dimension element 'timestamp'; perhaps you meant 'timespan.begin'\?"
2903 ):
2904 list(do_query("visit").order_by("timestamp.begin"))
2906 def testQueryDataIdsGovernorExceptions(self):
2907 """Test exceptions raised by queryDataIds() for incorrect governors."""
2908 registry = self.makeRegistry()
2909 self.loadData(registry, "base.yaml")
2910 self.loadData(registry, "datasets.yaml")
2911 self.loadData(registry, "spatial.yaml")
2913 def do_query(dimensions, dataId=None, where="", bind=None, **kwargs):
2914 return registry.queryDataIds(dimensions, dataId=dataId, where=where, bind=bind, **kwargs)
2916 Test = namedtuple(
2917 "testQueryDataIdExceptionsTest",
2918 ("dimensions", "dataId", "where", "bind", "kwargs", "exception", "count"),
2919 defaults=(None, None, None, {}, None, 0),
2920 )
2922 test_data = (
2923 Test("tract,visit", count=6),
2924 Test("tract,visit", kwargs={"instrument": "Cam1", "skymap": "SkyMap1"}, count=6),
2925 Test(
2926 "tract,visit", kwargs={"instrument": "Cam2", "skymap": "SkyMap1"}, exception=DataIdValueError
2927 ),
2928 Test("tract,visit", dataId={"instrument": "Cam1", "skymap": "SkyMap1"}, count=6),
2929 Test(
2930 "tract,visit", dataId={"instrument": "Cam1", "skymap": "SkyMap2"}, exception=DataIdValueError
2931 ),
2932 Test("tract,visit", where="instrument='Cam1' AND skymap='SkyMap1'", count=6),
2933 Test("tract,visit", where="instrument='Cam1' AND skymap='SkyMap5'", exception=DataIdValueError),
2934 Test(
2935 "tract,visit",
2936 where="instrument=cam AND skymap=map",
2937 bind={"cam": "Cam1", "map": "SkyMap1"},
2938 count=6,
2939 ),
2940 Test(
2941 "tract,visit",
2942 where="instrument=cam AND skymap=map",
2943 bind={"cam": "Cam", "map": "SkyMap"},
2944 exception=DataIdValueError,
2945 ),
2946 )
2948 for test in test_data:
2949 dimensions = test.dimensions.split(",")
2950 if test.exception:
2951 with self.assertRaises(test.exception):
2952 do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs).count()
2953 else:
2954 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2955 self.assertEqual(query.count(discard=True), test.count)
2957 # Repeat the same checks on materialized queries.
2958 if test.exception:
2959 with self.assertRaises(test.exception):
2960 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2961 with query.materialize() as materialized:
2962 materialized.count(discard=True)
2963 else:
2964 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2965 with query.materialize() as materialized:
2966 self.assertEqual(materialized.count(discard=True), test.count)
2968 def testQueryDimensionRecordsOrderBy(self):
2969 """Test order_by and limit on result returned by
2970 queryDimensionRecords().
2971 """
2972 registry = self.makeRegistry()
2973 self.loadData(registry, "base.yaml")
2974 self.loadData(registry, "datasets.yaml")
2975 self.loadData(registry, "spatial.yaml")
2977 def do_query(element, datasets=None, collections=None):
2978 return registry.queryDimensionRecords(
2979 element, instrument="Cam1", datasets=datasets, collections=collections
2980 )
2982 query = do_query("detector")
2983 self.assertEqual(len(list(query)), 4)
2985 Test = namedtuple(
2986 "testQueryDataIdsOrderByTest",
2987 ("element", "order_by", "result", "limit", "datasets", "collections"),
2988 defaults=(None, None, None),
2989 )
2991 test_data = (
2992 Test("detector", "detector", (1, 2, 3, 4)),
2993 Test("detector", "-detector", (4, 3, 2, 1)),
2994 Test("detector", "raft,-name_in_raft", (2, 1, 4, 3)),
2995 Test("detector", "-detector.purpose", (4,), limit=(1,)),
2996 Test("detector", "-purpose,detector.raft,name_in_raft", (2, 3), limit=(2, 2)),
2997 Test("visit", "visit", (1, 2)),
2998 Test("visit", "-visit.id", (2, 1)),
2999 Test("visit", "zenith_angle", (1, 2)),
3000 Test("visit", "-visit.name", (2, 1)),
3001 Test("visit", "day_obs,-timespan.begin", (2, 1)),
3002 )
3004 for test in test_data:
3005 order_by = test.order_by.split(",")
3006 query = do_query(test.element).order_by(*order_by)
3007 if test.limit is not None:
3008 query = query.limit(*test.limit)
3009 dataIds = tuple(rec.id for rec in query)
3010 self.assertEqual(dataIds, test.result)
3012 # Error handling for malformed or unknown names in order_by().
3013 for order_by in ("", "-"):
3014 with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"):
3015 list(do_query("detector").order_by(order_by))
3017 for order_by in ("undimension.name", "-undimension.name"):
3018 with self.assertRaisesRegex(ValueError, "Element name mismatch: 'undimension'"):
3019 list(do_query("detector").order_by(order_by))
3021 for order_by in ("attract", "-attract"):
3022 with self.assertRaisesRegex(ValueError, "Field 'attract' does not exist in 'detector'."):
3023 list(do_query("detector").order_by(order_by))
3025 for order_by in ("timestamp.begin", "-timestamp.begin"):
3026 with self.assertRaisesRegex(
3027 ValueError,
3028 r"Element name mismatch: 'timestamp' instead of 'visit'; "
3029 r"perhaps you meant 'timespan.begin'\?",
3030 ):
3031 list(do_query("visit").order_by(order_by))
3033 def testQueryDimensionRecordsExceptions(self):
3034 """Test exceptions raised by queryDimensionRecords()."""
3035 registry = self.makeRegistry()
3036 self.loadData(registry, "base.yaml")
3037 self.loadData(registry, "datasets.yaml")
3038 self.loadData(registry, "spatial.yaml")
3040 result = registry.queryDimensionRecords("detector")
3041 self.assertEqual(result.count(), 4)
3042 result = registry.queryDimensionRecords("detector", instrument="Cam1")
3043 self.assertEqual(result.count(), 4)
3044 result = registry.queryDimensionRecords("detector", dataId={"instrument": "Cam1"})
3045 self.assertEqual(result.count(), 4)
3046 result = registry.queryDimensionRecords("detector", where="instrument='Cam1'")
3047 self.assertEqual(result.count(), 4)
3048 result = registry.queryDimensionRecords("detector", where="instrument=instr", bind={"instr": "Cam1"})
3049 self.assertEqual(result.count(), 4)
3051 with self.assertRaisesRegex(DataIdValueError, "dimension instrument"):
3052 result = registry.queryDimensionRecords("detector", instrument="NotCam1")
3053 result.count()
3055 with self.assertRaisesRegex(DataIdValueError, "dimension instrument"):
3056 result = registry.queryDimensionRecords("detector", dataId={"instrument": "NotCam1"})
3057 result.count()
3059 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
3060 result = registry.queryDimensionRecords("detector", where="instrument='NotCam1'")
3061 result.count()
3063 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
3064 result = registry.queryDimensionRecords(
3065 "detector", where="instrument=instr", bind={"instr": "NotCam1"}
3066 )
3067 result.count()
3069 def testDatasetConstrainedDimensionRecordQueries(self):
3070 """Test that queryDimensionRecords works even when given a dataset
3071 constraint whose dimensions extend beyond the requested dimension
3072 element's.
3073 """
3074 registry = self.makeRegistry()
3075 self.loadData(registry, "base.yaml")
3076 self.loadData(registry, "datasets.yaml")
3077 # Query for physical_filter dimension records, using a dataset whose
3078 # dimensions extend beyond physical_filter's.
3079 records = registry.queryDimensionRecords(
3080 "physical_filter",
3081 datasets=["flat"],
3082 collections="imported_r",
3083 )
3084 self.assertEqual({record.name for record in records}, {"Cam1-R1", "Cam1-R2"})
3085 # Trying to constrain by all dataset types is an error.
3086 with self.assertRaises(TypeError):
3087 list(registry.queryDimensionRecords("physical_filter", datasets=..., collections="imported_r"))
3089 def testSkyPixDatasetQueries(self):
3090 """Test that we can build queries involving skypix dimensions as long
3091 as a dataset type that uses those dimensions is included.
3092 """
3093 registry = self.makeRegistry()
3094 self.loadData(registry, "base.yaml")
3095 dataset_type = DatasetType(
3096 "a", dimensions=["htm7", "instrument"], universe=registry.dimensions, storageClass="int"
3097 )
3098 registry.registerDatasetType(dataset_type)
3099 run = "r"
3100 registry.registerRun(run)
3101 # First try queries where there are no datasets; the concern is whether
3102 # we can even build and execute these queries without raising, even
3103 # when "doomed" query shortcuts are in play.
3104 self.assertFalse(
3105 list(registry.queryDataIds(["htm7", "instrument"], datasets=dataset_type, collections=run))
3106 )
3107 self.assertFalse(list(registry.queryDatasets(dataset_type, collections=run)))
3108 # Now add a dataset and see that we can get it back.
3109 htm7 = registry.dimensions.skypix["htm"][7].pixelization
3110 data_id = registry.expandDataId(instrument="Cam1", htm7=htm7.universe()[0][0])
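# (pixelization.universe() is a sphgeom RangeSet of valid pixel indexes;
# [0][0] takes the begin of its first range, i.e. the smallest valid
# htm7 index.)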
3111 (ref,) = registry.insertDatasets(dataset_type, [data_id], run=run)
3112 self.assertEqual(
3113 set(registry.queryDataIds(["htm7", "instrument"], datasets=dataset_type, collections=run)),
3114 {data_id},
3115 )
3116 self.assertEqual(set(registry.queryDatasets(dataset_type, collections=run)), {ref})
3118 def testDatasetIdFactory(self):
3119 """Simple test for DatasetIdFactory, mostly to catch potential changes
3120 in its API.
3121 """
3122 registry = self.makeRegistry()
3123 factory = DatasetIdFactory()
3124 dataset_type = DatasetType(
3125 "datasetType",
3126 dimensions=["detector", "instrument"],
3127 universe=registry.dimensions,
3128 storageClass="int",
3129 )
3130 run = "run"
3131 data_id = DataCoordinate.standardize(instrument="Cam1", detector=1, graph=dataset_type.dimensions)
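# UNIQUE mode should produce a random UUID4, while the DATAID_TYPE* modes
# produce name-based UUID5s, so the same inputs always map to the same id.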
3133 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.UNIQUE)
3134 self.assertIsInstance(datasetId, uuid.UUID)
3135 self.assertEqual(datasetId.version, 4)
3137 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE)
3138 self.assertIsInstance(datasetId, uuid.UUID)
3139 self.assertEqual(datasetId.version, 5)
3141 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN)
3142 self.assertIsInstance(datasetId, uuid.UUID)
3143 self.assertEqual(datasetId.version, 5)
3145 def testExposureQueries(self):
3146 """Test query methods using arguments sourced from the exposure log
3147 service.
3149 The most complete test dataset currently available to daf_butler tests
3150 is the hsc-rc2-subset.yaml export (which is unfortunately distinct from
3151 the lsst/rc2_subset GitHub repo), but that does not have 'exposure'
3152 dimension records as it was focused on providing nontrivial spatial
3153 overlaps between visit+detector and tract+patch. So in this test we
3154 need to translate queries that originally used the exposure dimension
3155 to use the (very similar) visit dimension instead.
3156 """
3157 registry = self.makeRegistry()
3158 self.loadData(registry, "hsc-rc2-subset.yaml")
3159 self.assertEqual(
3160 [
3161 record.id
3162 for record in registry.queryDimensionRecords("visit", instrument="HSC")
3163 .order_by("id")
3164 .limit(5)
3165 ],
3166 [318, 322, 326, 330, 332],
3167 )
3168 self.assertEqual(
3169 [
3170 data_id["visit"]
3171 for data_id in registry.queryDataIds(["visit"], instrument="HSC").order_by("id").limit(5)
3172 ],
3173 [318, 322, 326, 330, 332],
3174 )
3175 self.assertEqual(
3176 [
3177 record.id
3178 for record in registry.queryDimensionRecords("detector", instrument="HSC")
3179 .order_by("full_name")
3180 .limit(5)
3181 ],
3182 [73, 72, 71, 70, 65],
3183 )
3184 self.assertEqual(
3185 [
3186 data_id["detector"]
3187 for data_id in registry.queryDataIds(["detector"], instrument="HSC")
3188 .order_by("full_name")
3189 .limit(5)
3190 ],
3191 [73, 72, 71, 70, 65],
3192 )
3194 def test_long_query_names(self) -> None:
3195 """Test that queries involving very long names are handled correctly.
3197 This is especially important for PostgreSQL, which truncates identifiers
3198 longer than 63 characters, but it's worth testing for all DBs.
3199 """
3200 registry = self.makeRegistry()
3201 name = "abcd" * 17
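# ("abcd" * 17 is 68 characters, comfortably past PostgreSQL's
# 63-character identifier limit.)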
3202 registry.registerDatasetType(
3203 DatasetType(
3204 name,
3205 dimensions=(),
3206 storageClass="Exposure",
3207 universe=registry.dimensions,
3208 )
3209 )
3210 # Need to search more than one collection actually containing a
3211 # matching dataset to avoid optimizations that sidestep bugs due to
3212 # truncation by making findFirst=True a no-op.
3213 run1 = "run1"
3214 registry.registerRun(run1)
3215 run2 = "run2"
3216 registry.registerRun(run2)
3217 (ref1,) = registry.insertDatasets(name, [DataCoordinate.makeEmpty(registry.dimensions)], run1)
3218 registry.insertDatasets(name, [DataCoordinate.makeEmpty(registry.dimensions)], run2)
3219 self.assertEqual(
3220 set(registry.queryDatasets(name, collections=[run1, run2], findFirst=True)),
3221 {ref1},
3222 )
3224 def test_skypix_constraint_queries(self) -> None:
3225 """Test queries spatially constrained by a skypix data ID."""
3226 registry = self.makeRegistry()
3227 self.loadData(registry, "hsc-rc2-subset.yaml")
3228 patch_regions = {
3229 (data_id["tract"], data_id["patch"]): data_id.region
3230 for data_id in registry.queryDataIds(["patch"]).expanded()
3231 }
3232 skypix_dimension: SkyPixDimension = registry.dimensions["htm11"]
3233 # This check ensures the test doesn't become trivial due to a config
3234 # change; if it does, just pick a different HTM level.
3235 self.assertNotEqual(skypix_dimension, registry.dimensions.commonSkyPix)
3236 # Gather all skypix IDs that definitely overlap at least one of these
3237 # patches.
3238 relevant_skypix_ids = lsst.sphgeom.RangeSet()
3239 for patch_region in patch_regions.values():
3240 relevant_skypix_ids |= skypix_dimension.pixelization.interior(patch_region)
3241 # Look for a "nontrivial" skypix_id that overlaps at least one patch
3242 # and does not overlap at least one other patch.
3243 for skypix_id in itertools.chain.from_iterable(
3244 range(begin, end) for begin, end in relevant_skypix_ids
3245 ):
3246 skypix_region = skypix_dimension.pixelization.pixel(skypix_id)
3247 overlapping_patches = {
3248 patch_key
3249 for patch_key, patch_region in patch_regions.items()
3250 if not patch_region.isDisjointFrom(skypix_region)
3251 }
3252 if overlapping_patches and overlapping_patches != patch_regions.keys():
3253 break
3254 else:
3255 raise RuntimeError("Could not find usable skypix ID for this dimension configuration.")
3256 self.assertEqual(
3257 {
3258 (data_id["tract"], data_id["patch"])
3259 for data_id in registry.queryDataIds(
3260 ["patch"],
3261 dataId={skypix_dimension.name: skypix_id},
3262 )
3263 },
3264 overlapping_patches,
3265 )
3266 # Test that a three-way join that includes the common skypix system in
3267 # the dimensions doesn't generate redundant join terms in the query.
3268 full_data_ids = set(
3269 registry.queryDataIds(
3270 ["tract", "visit", "htm7"], skymap="hsc_rings_v1", instrument="HSC"
3271 ).expanded()
3272 )
3273 self.assertGreater(len(full_data_ids), 0)
3274 for data_id in full_data_ids:
3275 self.assertFalse(data_id.records["tract"].region.isDisjointFrom(data_id.records["htm7"].region))
3276 self.assertFalse(data_id.records["visit"].region.isDisjointFrom(data_id.records["htm7"].region))
3278 def test_spatial_constraint_queries(self) -> None:
3279 """Test queries in which one spatial dimension in the constraint (data
3280 ID or ``where`` string) constrains a different spatial dimension in the
3281 query result columns.
3282 """
3283 registry = self.makeRegistry()
3284 self.loadData(registry, "hsc-rc2-subset.yaml")
3285 patch_regions = {
3286 (data_id["tract"], data_id["patch"]): data_id.region
3287 for data_id in registry.queryDataIds(["patch"]).expanded()
3288 }
3289 observation_regions = {
3290 (data_id["visit"], data_id["detector"]): data_id.region
3291 for data_id in registry.queryDataIds(["visit", "detector"]).expanded()
3292 }
3293 all_combos = {
3294 (patch_key, observation_key)
3295 for patch_key, observation_key in itertools.product(patch_regions, observation_regions)
3296 }
3297 overlapping_combos = {
3298 (patch_key, observation_key)
3299 for patch_key, observation_key in all_combos
3300 if not patch_regions[patch_key].isDisjointFrom(observation_regions[observation_key])
3301 }
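# These overlaps, brute-forced directly from the regions with sphgeom,
# are the ground truth that the registry's spatial joins must reproduce.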
3302 # Check a direct spatial join with no constraint first.
3303 self.assertEqual(
3304 {
3305 ((data_id["tract"], data_id["patch"]), (data_id["visit"], data_id["detector"]))
3306 for data_id in registry.queryDataIds(["patch", "visit", "detector"])
3307 },
3308 overlapping_combos,
3309 )
3310 overlaps_by_patch: defaultdict[tuple[int, int], set[tuple[int, int]]] = defaultdict(set)
3311 overlaps_by_observation: defaultdict[tuple[int, int], set[tuple[int, int]]] = defaultdict(set)
3312 for patch_key, observation_key in overlapping_combos:
3313 overlaps_by_patch[patch_key].add(observation_key)
3314 overlaps_by_observation[observation_key].add(patch_key)
3315 # Find a patch and an observation that each overlap at least one of
3316 # the other kind, but not all of them.
3317 nontrivial_patch = next(
3318 iter(
3319 patch_key
3320 for patch_key, observation_keys in overlaps_by_patch.items()
3321 if observation_keys and observation_keys != observation_regions.keys()
3322 )
3323 )
3324 nontrivial_observation = next(
3325 iter(
3326 observation_key
3327 for observation_key, patch_keys in overlaps_by_observation.items()
3328 if patch_keys and patch_keys != patch_regions.keys()
3329 )
3330 )
3331 # Use the nontrivial patch and observation as constraints on the
3332 # other dimensions in various ways, first via a 'where' expression.
3333 # It's better in general to use 'bind' instead of f-strings, but these
3334 # are all integers so there are no quoting concerns.
3335 self.assertEqual(
3336 {
3337 (data_id["visit"], data_id["detector"])
3338 for data_id in registry.queryDataIds(
3339 ["visit", "detector"],
3340 where=f"tract={nontrivial_patch[0]} AND patch={nontrivial_patch[1]}",
3341 skymap="hsc_rings_v1",
3342 )
3343 },
3344 overlaps_by_patch[nontrivial_patch],
3345 )
3346 self.assertEqual(
3347 {
3348 (data_id["tract"], data_id["patch"])
3349 for data_id in registry.queryDataIds(
3350 ["patch"],
3351 where=f"visit={nontrivial_observation[0]} AND detector={nontrivial_observation[1]}",
3352 instrument="HSC",
3353 )
3354 },
3355 overlaps_by_observation[nontrivial_observation],
3356 )
3357 # and then via the dataId argument.
3358 self.assertEqual(
3359 {
3360 (data_id["visit"], data_id["detector"])
3361 for data_id in registry.queryDataIds(
3362 ["visit", "detector"],
3363 dataId={
3364 "tract": nontrivial_patch[0],
3365 "patch": nontrivial_patch[1],
3366 },
3367 skymap="hsc_rings_v1",
3368 )
3369 },
3370 overlaps_by_patch[nontrivial_patch],
3371 )
3372 self.assertEqual(
3373 {
3374 (data_id["tract"], data_id["patch"])
3375 for data_id in registry.queryDataIds(
3376 ["patch"],
3377 dataId={
3378 "visit": nontrivial_observation[0],
3379 "detector": nontrivial_observation[1],
3380 },
3381 instrument="HSC",
3382 )
3383 },
3384 overlaps_by_observation[nontrivial_observation],
3385 )
3387 def test_query_projection_drop_postprocessing(self) -> None:
3388 """Test that projections and deduplications on query objects can
3389 drop post-query region filtering to ensure the query remains in
3390 the SQL engine.
3391 """
3392 registry = self.makeRegistry()
3393 self.loadData(registry, "base.yaml")
3394 self.loadData(registry, "spatial.yaml")
3396 def pop_transfer(tree: Relation) -> Relation:
3397 """If a relation tree terminates with a transfer to a new engine,
3398 return the relation prior to that transfer. If not, return the
3399 original relation.
3400 """
3401 match tree:
3402 case Transfer(target=target):
3403 return target
3404 case _:
3405 return tree
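# (The match statement uses a class pattern: Transfer(target=target) both
# tests the node's type and captures its target attribute in one step.)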
3407 # There's no public way to get a Query object yet, so we get one from a
3408 # DataCoordinateQueryResults private attribute. When a public API is
3409 # available this test should use it.
3410 query = registry.queryDataIds(["visit", "detector", "tract", "patch"])._query
3411 # We expect this query to terminate in the iteration engine originally,
3412 # because region-filtering is necessary.
3413 self.assertIsInstance(pop_transfer(query.relation).engine, iteration.Engine)
3414 # If we deduplicate, we usually have to do that downstream of the
3415 # filtering. That means the deduplication has to happen in the
3416 # iteration engine.
3417 self.assertIsInstance(pop_transfer(query.projected(unique=True).relation).engine, iteration.Engine)
3418 # If we pass drop_postprocessing, we instead drop the region filtering
3419 # so the deduplication can happen in SQL (though there might still be a
3420 # transfer to iteration at the tail of the tree that we can ignore;
3421 # that's what the pop_transfer takes care of here).
3422 self.assertIsInstance(
3423 pop_transfer(query.projected(unique=True, drop_postprocessing=True).relation).engine,
3424 sql.Engine,
3425 )
3427 def test_query_empty_collections(self) -> None:
3428 """Test for registry query methods with empty collections. The methods
3429 should return an empty result set (or None when applicable) and provide
3430 "doomed" diagnostics.
3431 """
3432 registry = self.makeRegistry()
3433 self.loadData(registry, "base.yaml")
3434 self.loadData(registry, "datasets.yaml")
3436 # Tests for registry.findDataset()
3437 with self.assertRaises(NoDefaultCollectionError):
3438 registry.findDataset("bias", instrument="Cam1", detector=1)
3439 self.assertIsNotNone(registry.findDataset("bias", instrument="Cam1", detector=1, collections=...))
3440 self.assertIsNone(registry.findDataset("bias", instrument="Cam1", detector=1, collections=[]))
3442 # Tests for registry.queryDatasets()
3443 with self.assertRaises(NoDefaultCollectionError):
3444 registry.queryDatasets("bias")
3445 self.assertTrue(list(registry.queryDatasets("bias", collections=...)))
3447 result = registry.queryDatasets("bias", collections=[])
3448 self.assertEqual(len(list(result)), 0)
3449 messages = list(result.explain_no_results())
3450 self.assertTrue(messages)
3451 self.assertTrue(any("because collection list is empty" in message for message in messages))
3453 # Tests for registry.queryDataIds()
3454 with self.assertRaises(NoDefaultCollectionError):
3455 registry.queryDataIds("detector", datasets="bias")
3456 self.assertTrue(list(registry.queryDataIds("detector", datasets="bias", collections=...)))
3458 result = registry.queryDataIds("detector", datasets="bias", collections=[])
3459 self.assertEqual(len(list(result)), 0)
3460 messages = list(result.explain_no_results())
3461 self.assertTrue(messages)
3462 self.assertTrue(any("because collection list is empty" in message for message in messages))
3464 # Tests for registry.queryDimensionRecords()
3465 with self.assertRaises(NoDefaultCollectionError):
3466 registry.queryDimensionRecords("detector", datasets="bias")
3467 self.assertTrue(list(registry.queryDimensionRecords("detector", datasets="bias", collections=...)))
3469 result = registry.queryDimensionRecords("detector", datasets="bias", collections=[])
3470 self.assertEqual(len(list(result)), 0)
3471 messages = list(result.explain_no_results())
3472 self.assertTrue(messages)
3473 self.assertTrue(any("because collection list is empty" in message for message in messages))