Coverage for python/lsst/daf/butler/registry/tests/_registry.py: 4%
1463 statements
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-17 02:31 -0700
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-17 02:31 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["RegistryTests"]
25import itertools
26import logging
27import os
28import re
29import unittest
30import uuid
31from abc import ABC, abstractmethod
32from collections import defaultdict, namedtuple
33from datetime import datetime, timedelta
34from typing import TYPE_CHECKING, Iterator, Optional, Type, Union
36import astropy.time
37import sqlalchemy
39try:
40 import numpy as np
41except ImportError:
42 np = None
44import lsst.sphgeom
45from lsst.daf.relation import Relation, RelationalAlgebraError, Transfer, iteration, sql
47from ...core import (
48 DataCoordinate,
49 DataCoordinateSet,
50 DatasetAssociation,
51 DatasetIdGenEnum,
52 DatasetRef,
53 DatasetType,
54 DimensionGraph,
55 NamedValueSet,
56 SkyPixDimension,
57 StorageClass,
58 Timespan,
59 ddl,
60)
61from .._collection_summary import CollectionSummary
62from .._collectionType import CollectionType
63from .._config import RegistryConfig
64from .._exceptions import (
65 ArgumentError,
66 CollectionError,
67 CollectionTypeError,
68 ConflictingDefinitionError,
69 DataIdValueError,
70 DatasetTypeError,
71 InconsistentDataIdError,
72 MissingCollectionError,
73 MissingDatasetTypeError,
74 NoDefaultCollectionError,
75 OrphanedRecordError,
76)
77from ..interfaces import ButlerAttributeExistsError
79if TYPE_CHECKING:
80 from .._registry import Registry
83class RegistryTests(ABC):
84 """Generic tests for the `Registry` class that can be subclassed to
85 generate tests for different configurations.
86 """
88 collectionsManager: Optional[str] = None
89 """Name of the collections manager class, if subclass provides value for
90 this member then it overrides name specified in default configuration
91 (`str`).
92 """
94 datasetsManager: Optional[str | dict[str, str]] = None
95 """Name or configuration dictionary of the datasets manager class, if
96 subclass provides value for this member then it overrides name specified
97 in default configuration (`str` or `dict`).
98 """
100 @classmethod
101 @abstractmethod
102 def getDataDir(cls) -> str:
103 """Return the root directory containing test data YAML files."""
104 raise NotImplementedError()
106 def makeRegistryConfig(self) -> RegistryConfig:
107 """Create RegistryConfig used to create a registry.
109 This method should be called by a subclass from `makeRegistry`.
110 Returned instance will be pre-configured based on the values of class
111 members, and default-configured for all other parameters. Subclasses
112 that need default configuration should just instantiate
113 `RegistryConfig` directly.
114 """
115 config = RegistryConfig()
116 if self.collectionsManager:
117 config["managers", "collections"] = self.collectionsManager
118 if self.datasetsManager:
119 config["managers", "datasets"] = self.datasetsManager
120 return config
122 @abstractmethod
123 def makeRegistry(self, share_repo_with: Optional[Registry] = None) -> Optional[Registry]:
124 """Return the Registry instance to be tested.
126 Parameters
127 ----------
128 share_repo_with : `Registry`, optional
129 If provided, the new registry should point to the same data
130 repository as this existing registry.
132 Returns
133 -------
134 registry : `Registry`
135 New `Registry` instance, or `None` *only* if `share_repo_with` is
136 not `None` and this test case does not support that argument
137 (e.g. it is impossible with in-memory SQLite DBs).
138 """
139 raise NotImplementedError()
141 def loadData(self, registry: Registry, filename: str):
142 """Load registry test data from ``getDataDir/<filename>``,
143 which should be a YAML import/export file.
144 """
145 from ...transfers import YamlRepoImportBackend
147 with open(os.path.join(self.getDataDir(), filename), "r") as stream:
148 backend = YamlRepoImportBackend(stream, registry)
149 backend.register()
150 backend.load(datastore=None)
152 def checkQueryResults(self, results, expected):
153 """Check that a query results object contains expected values.
155 Parameters
156 ----------
157 results : `DataCoordinateQueryResults` or `DatasetQueryResults`
158 A lazy-evaluation query results object.
159 expected : `list`
160 A list of `DataCoordinate` o `DatasetRef` objects that should be
161 equal to results of the query, aside from ordering.
162 """
163 self.assertCountEqual(list(results), expected)
164 self.assertEqual(results.count(), len(expected))
165 if expected:
166 self.assertTrue(results.any())
167 else:
168 self.assertFalse(results.any())
170 def testOpaque(self):
171 """Tests for `Registry.registerOpaqueTable`,
172 `Registry.insertOpaqueData`, `Registry.fetchOpaqueData`, and
173 `Registry.deleteOpaqueData`.
174 """
175 registry = self.makeRegistry()
176 table = "opaque_table_for_testing"
177 registry.registerOpaqueTable(
178 table,
179 spec=ddl.TableSpec(
180 fields=[
181 ddl.FieldSpec("id", dtype=sqlalchemy.BigInteger, primaryKey=True),
182 ddl.FieldSpec("name", dtype=sqlalchemy.String, length=16, nullable=False),
183 ddl.FieldSpec("count", dtype=sqlalchemy.SmallInteger, nullable=True),
184 ],
185 ),
186 )
187 rows = [
188 {"id": 1, "name": "one", "count": None},
189 {"id": 2, "name": "two", "count": 5},
190 {"id": 3, "name": "three", "count": 6},
191 ]
192 registry.insertOpaqueData(table, *rows)
193 self.assertCountEqual(rows, list(registry.fetchOpaqueData(table)))
194 self.assertEqual(rows[0:1], list(registry.fetchOpaqueData(table, id=1)))
195 self.assertEqual(rows[1:2], list(registry.fetchOpaqueData(table, name="two")))
196 self.assertEqual(rows[0:1], list(registry.fetchOpaqueData(table, id=(1, 3), name=("one", "two"))))
197 self.assertEqual(rows, list(registry.fetchOpaqueData(table, id=(1, 2, 3))))
198 # Test very long IN clause which exceeds sqlite limit on number of
199 # parameters. SQLite says the limit is 32k but it looks like it is
200 # much higher.
201 self.assertEqual(rows, list(registry.fetchOpaqueData(table, id=list(range(300_000)))))
202 # Two IN clauses, each longer than 1k batch size, first with
203 # duplicates, second has matching elements in different batches (after
204 # sorting).
205 self.assertEqual(
206 rows[0:2],
207 list(
208 registry.fetchOpaqueData(
209 table,
210 id=list(range(1000)) + list(range(100, 0, -1)),
211 name=["one"] + [f"q{i}" for i in range(2200)] + ["two"],
212 )
213 ),
214 )
215 self.assertEqual([], list(registry.fetchOpaqueData(table, id=1, name="two")))
216 registry.deleteOpaqueData(table, id=3)
217 self.assertCountEqual(rows[:2], list(registry.fetchOpaqueData(table)))
218 registry.deleteOpaqueData(table)
219 self.assertEqual([], list(registry.fetchOpaqueData(table)))
221 def testDatasetType(self):
222 """Tests for `Registry.registerDatasetType` and
223 `Registry.getDatasetType`.
224 """
225 registry = self.makeRegistry()
226 # Check valid insert
227 datasetTypeName = "test"
228 storageClass = StorageClass("testDatasetType")
229 registry.storageClasses.registerStorageClass(storageClass)
230 dimensions = registry.dimensions.extract(("instrument", "visit"))
231 differentDimensions = registry.dimensions.extract(("instrument", "patch"))
232 inDatasetType = DatasetType(datasetTypeName, dimensions, storageClass)
233 # Inserting for the first time should return True
234 self.assertTrue(registry.registerDatasetType(inDatasetType))
235 outDatasetType1 = registry.getDatasetType(datasetTypeName)
236 self.assertEqual(outDatasetType1, inDatasetType)
238 # Re-inserting should work
239 self.assertFalse(registry.registerDatasetType(inDatasetType))
240 # Except when they are not identical
241 with self.assertRaises(ConflictingDefinitionError):
242 nonIdenticalDatasetType = DatasetType(datasetTypeName, differentDimensions, storageClass)
243 registry.registerDatasetType(nonIdenticalDatasetType)
245 # Template can be None
246 datasetTypeName = "testNoneTemplate"
247 storageClass = StorageClass("testDatasetType2")
248 registry.storageClasses.registerStorageClass(storageClass)
249 dimensions = registry.dimensions.extract(("instrument", "visit"))
250 inDatasetType = DatasetType(datasetTypeName, dimensions, storageClass)
251 registry.registerDatasetType(inDatasetType)
252 outDatasetType2 = registry.getDatasetType(datasetTypeName)
253 self.assertEqual(outDatasetType2, inDatasetType)
255 allTypes = set(registry.queryDatasetTypes())
256 self.assertEqual(allTypes, {outDatasetType1, outDatasetType2})
258 def testDimensions(self):
259 """Tests for `Registry.insertDimensionData`,
260 `Registry.syncDimensionData`, and `Registry.expandDataId`.
261 """
262 registry = self.makeRegistry()
263 dimensionName = "instrument"
264 dimension = registry.dimensions[dimensionName]
265 dimensionValue = {
266 "name": "DummyCam",
267 "visit_max": 10,
268 "visit_system": 0,
269 "exposure_max": 10,
270 "detector_max": 2,
271 "class_name": "lsst.pipe.base.Instrument",
272 }
273 registry.insertDimensionData(dimensionName, dimensionValue)
274 # Inserting the same value twice should fail
275 with self.assertRaises(sqlalchemy.exc.IntegrityError):
276 registry.insertDimensionData(dimensionName, dimensionValue)
277 # expandDataId should retrieve the record we just inserted
278 self.assertEqual(
279 registry.expandDataId(instrument="DummyCam", graph=dimension.graph)
280 .records[dimensionName]
281 .toDict(),
282 dimensionValue,
283 )
284 # expandDataId should raise if there is no record with the given ID.
285 with self.assertRaises(DataIdValueError):
286 registry.expandDataId({"instrument": "Unknown"}, graph=dimension.graph)
287 # band doesn't have a table; insert should fail.
288 with self.assertRaises(TypeError):
289 registry.insertDimensionData("band", {"band": "i"})
290 dimensionName2 = "physical_filter"
291 dimension2 = registry.dimensions[dimensionName2]
292 dimensionValue2 = {"name": "DummyCam_i", "band": "i"}
293 # Missing required dependency ("instrument") should fail
294 with self.assertRaises(KeyError):
295 registry.insertDimensionData(dimensionName2, dimensionValue2)
296 # Adding required dependency should fix the failure
297 dimensionValue2["instrument"] = "DummyCam"
298 registry.insertDimensionData(dimensionName2, dimensionValue2)
299 # expandDataId should retrieve the record we just inserted.
300 self.assertEqual(
301 registry.expandDataId(instrument="DummyCam", physical_filter="DummyCam_i", graph=dimension2.graph)
302 .records[dimensionName2]
303 .toDict(),
304 dimensionValue2,
305 )
306 # Use syncDimensionData to insert a new record successfully.
307 dimensionName3 = "detector"
308 dimensionValue3 = {
309 "instrument": "DummyCam",
310 "id": 1,
311 "full_name": "one",
312 "name_in_raft": "zero",
313 "purpose": "SCIENCE",
314 }
315 self.assertTrue(registry.syncDimensionData(dimensionName3, dimensionValue3))
316 # Sync that again. Note that one field ("raft") is NULL, and that
317 # should be okay.
318 self.assertFalse(registry.syncDimensionData(dimensionName3, dimensionValue3))
319 # Now try that sync with the same primary key but a different value.
320 # This should fail.
321 with self.assertRaises(ConflictingDefinitionError):
322 registry.syncDimensionData(
323 dimensionName3,
324 {
325 "instrument": "DummyCam",
326 "id": 1,
327 "full_name": "one",
328 "name_in_raft": "four",
329 "purpose": "SCIENCE",
330 },
331 )
333 @unittest.skipIf(np is None, "numpy not available.")
334 def testNumpyDataId(self):
335 """Test that we can use a numpy int in a dataId."""
336 registry = self.makeRegistry()
337 dimensionEntries = [
338 ("instrument", {"instrument": "DummyCam"}),
339 ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
340 # Using an np.int64 here fails unless Records.fromDict is also
341 # patched to look for numbers.Integral
342 ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
343 ]
344 for args in dimensionEntries:
345 registry.insertDimensionData(*args)
347 # Try a normal integer and something that looks like an int but
348 # is not.
349 for visit_id in (42, np.int64(42)):
350 with self.subTest(visit_id=visit_id, id_type=type(visit_id).__name__):
351 expanded = registry.expandDataId({"instrument": "DummyCam", "visit": visit_id})
352 self.assertEqual(expanded["visit"], int(visit_id))
353 self.assertIsInstance(expanded["visit"], int)
355 def testDataIdRelationships(self):
356 """Test that `Registry.expandDataId` raises an exception when the given
357 keys are inconsistent.
358 """
359 registry = self.makeRegistry()
360 self.loadData(registry, "base.yaml")
361 # Insert a few more dimension records for the next test.
362 registry.insertDimensionData(
363 "exposure",
364 {"instrument": "Cam1", "id": 1, "obs_id": "one", "physical_filter": "Cam1-G"},
365 )
366 registry.insertDimensionData(
367 "exposure",
368 {"instrument": "Cam1", "id": 2, "obs_id": "two", "physical_filter": "Cam1-G"},
369 )
370 registry.insertDimensionData(
371 "visit_system",
372 {"instrument": "Cam1", "id": 0, "name": "one-to-one"},
373 )
374 registry.insertDimensionData(
375 "visit",
376 {"instrument": "Cam1", "id": 1, "name": "one", "physical_filter": "Cam1-G", "visit_system": 0},
377 )
378 registry.insertDimensionData(
379 "visit_definition",
380 {"instrument": "Cam1", "visit": 1, "exposure": 1, "visit_system": 0},
381 )
382 with self.assertRaises(InconsistentDataIdError):
383 registry.expandDataId(
384 {"instrument": "Cam1", "visit": 1, "exposure": 2},
385 )
387 def testDataset(self):
388 """Basic tests for `Registry.insertDatasets`, `Registry.getDataset`,
389 and `Registry.removeDatasets`.
390 """
391 registry = self.makeRegistry()
392 self.loadData(registry, "base.yaml")
393 run = "tésτ"
394 registry.registerRun(run)
395 datasetType = registry.getDatasetType("bias")
396 dataId = {"instrument": "Cam1", "detector": 2}
397 (ref,) = registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
398 outRef = registry.getDataset(ref.id)
399 self.assertIsNotNone(ref.id)
400 self.assertEqual(ref, outRef)
401 with self.assertRaises(ConflictingDefinitionError):
402 registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
403 registry.removeDatasets([ref])
404 self.assertIsNone(registry.findDataset(datasetType, dataId, collections=[run]))
406 def testFindDataset(self):
407 """Tests for `Registry.findDataset`."""
408 registry = self.makeRegistry()
409 self.loadData(registry, "base.yaml")
410 run = "tésτ"
411 datasetType = registry.getDatasetType("bias")
412 dataId = {"instrument": "Cam1", "detector": 4}
413 registry.registerRun(run)
414 (inputRef,) = registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
415 outputRef = registry.findDataset(datasetType, dataId, collections=[run])
416 self.assertEqual(outputRef, inputRef)
417 # Check that retrieval with invalid dataId raises
418 with self.assertRaises(LookupError):
419 dataId = {"instrument": "Cam1"} # no detector
420 registry.findDataset(datasetType, dataId, collections=run)
421 # Check that different dataIds match to different datasets
422 dataId1 = {"instrument": "Cam1", "detector": 1}
423 (inputRef1,) = registry.insertDatasets(datasetType, dataIds=[dataId1], run=run)
424 dataId2 = {"instrument": "Cam1", "detector": 2}
425 (inputRef2,) = registry.insertDatasets(datasetType, dataIds=[dataId2], run=run)
426 self.assertEqual(registry.findDataset(datasetType, dataId1, collections=run), inputRef1)
427 self.assertEqual(registry.findDataset(datasetType, dataId2, collections=run), inputRef2)
428 self.assertNotEqual(registry.findDataset(datasetType, dataId1, collections=run), inputRef2)
429 self.assertNotEqual(registry.findDataset(datasetType, dataId2, collections=run), inputRef1)
430 # Check that requesting a non-existing dataId returns None
431 nonExistingDataId = {"instrument": "Cam1", "detector": 3}
432 self.assertIsNone(registry.findDataset(datasetType, nonExistingDataId, collections=run))
433 # Search more than one collection, in which two have the right
434 # dataset type and another does not.
435 registry.registerRun("empty")
436 self.loadData(registry, "datasets-uuid.yaml")
437 bias1 = registry.findDataset("bias", instrument="Cam1", detector=2, collections=["imported_g"])
438 self.assertIsNotNone(bias1)
439 bias2 = registry.findDataset("bias", instrument="Cam1", detector=2, collections=["imported_r"])
440 self.assertIsNotNone(bias2)
441 self.assertEqual(
442 bias1,
443 registry.findDataset(
444 "bias", instrument="Cam1", detector=2, collections=["empty", "imported_g", "imported_r"]
445 ),
446 )
447 self.assertEqual(
448 bias2,
449 registry.findDataset(
450 "bias", instrument="Cam1", detector=2, collections=["empty", "imported_r", "imported_g"]
451 ),
452 )
453 # Search more than one collection, with one of them a CALIBRATION
454 # collection.
455 registry.registerCollection("Cam1/calib", CollectionType.CALIBRATION)
456 timespan = Timespan(
457 begin=astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai"),
458 end=astropy.time.Time("2020-01-01T02:00:00", format="isot", scale="tai"),
459 )
460 registry.certify("Cam1/calib", [bias2], timespan=timespan)
461 self.assertEqual(
462 bias1,
463 registry.findDataset(
464 "bias",
465 instrument="Cam1",
466 detector=2,
467 collections=["empty", "imported_g", "Cam1/calib"],
468 timespan=timespan,
469 ),
470 )
471 self.assertEqual(
472 bias2,
473 registry.findDataset(
474 "bias",
475 instrument="Cam1",
476 detector=2,
477 collections=["empty", "Cam1/calib", "imported_g"],
478 timespan=timespan,
479 ),
480 )
481 # If we try to search those same collections without a timespan, it
482 # should still work, since the CALIBRATION collection is ignored.
483 self.assertEqual(
484 bias1,
485 registry.findDataset(
486 "bias", instrument="Cam1", detector=2, collections=["empty", "imported_g", "Cam1/calib"]
487 ),
488 )
489 self.assertEqual(
490 bias1,
491 registry.findDataset(
492 "bias", instrument="Cam1", detector=2, collections=["empty", "Cam1/calib", "imported_g"]
493 ),
494 )
496 def testRemoveDatasetTypeSuccess(self):
497 """Test that Registry.removeDatasetType works when there are no
498 datasets of that type present.
499 """
500 registry = self.makeRegistry()
501 self.loadData(registry, "base.yaml")
502 registry.removeDatasetType("flat")
503 with self.assertRaises(MissingDatasetTypeError):
504 registry.getDatasetType("flat")
506 def testRemoveDatasetTypeFailure(self):
507 """Test that Registry.removeDatasetType raises when there are datasets
508 of that type present or if the dataset type is for a component.
509 """
510 registry = self.makeRegistry()
511 self.loadData(registry, "base.yaml")
512 self.loadData(registry, "datasets.yaml")
513 with self.assertRaises(OrphanedRecordError):
514 registry.removeDatasetType("flat")
515 with self.assertRaises(ValueError):
516 registry.removeDatasetType(DatasetType.nameWithComponent("flat", "image"))
518 def testImportDatasetsUUID(self):
519 """Test for `Registry._importDatasets` with UUID dataset ID."""
520 if isinstance(self.datasetsManager, str):
521 if not self.datasetsManager.endswith(".ByDimensionsDatasetRecordStorageManagerUUID"):
522 self.skipTest(f"Unexpected dataset manager {self.datasetsManager}")
523 elif isinstance(self.datasetsManager, dict):
524 if not self.datasetsManager["cls"].endswith(".ByDimensionsDatasetRecordStorageManagerUUID"):
525 self.skipTest(f"Unexpected dataset manager {self.datasetsManager['cls']}")
527 registry = self.makeRegistry()
528 self.loadData(registry, "base.yaml")
529 for run in range(6):
530 registry.registerRun(f"run{run}")
531 datasetTypeBias = registry.getDatasetType("bias")
532 datasetTypeFlat = registry.getDatasetType("flat")
533 dataIdBias1 = {"instrument": "Cam1", "detector": 1}
534 dataIdBias2 = {"instrument": "Cam1", "detector": 2}
535 dataIdFlat1 = {"instrument": "Cam1", "detector": 1, "physical_filter": "Cam1-G", "band": "g"}
537 ref = DatasetRef(datasetTypeBias, dataIdBias1, run="run0")
538 (ref1,) = registry._importDatasets([ref])
539 # UUID is used without change
540 self.assertEqual(ref.id, ref1.id)
542 # All different failure modes
543 refs = (
544 # Importing same DatasetRef with different dataset ID is an error
545 DatasetRef(datasetTypeBias, dataIdBias1, run="run0"),
546 # Same DatasetId but different DataId
547 DatasetRef(datasetTypeBias, dataIdBias2, id=ref1.id, run="run0"),
548 DatasetRef(datasetTypeFlat, dataIdFlat1, id=ref1.id, run="run0"),
549 # Same DatasetRef and DatasetId but different run
550 DatasetRef(datasetTypeBias, dataIdBias1, id=ref1.id, run="run1"),
551 )
552 for ref in refs:
553 with self.assertRaises(ConflictingDefinitionError):
554 registry._importDatasets([ref])
556 # Test for non-unique IDs, they can be re-imported multiple times.
557 for run, idGenMode in ((2, DatasetIdGenEnum.DATAID_TYPE), (4, DatasetIdGenEnum.DATAID_TYPE_RUN)):
558 with self.subTest(idGenMode=idGenMode):
559 # Use integer dataset ID to force UUID calculation in _import
560 ref = DatasetRef(datasetTypeBias, dataIdBias1, id=0, run=f"run{run}")
561 (ref1,) = registry._importDatasets([ref], idGenerationMode=idGenMode)
562 self.assertIsInstance(ref1.id, uuid.UUID)
563 self.assertEqual(ref1.id.version, 5)
565 # Importing it again is OK
566 (ref2,) = registry._importDatasets([ref1])
567 self.assertEqual(ref2.id, ref1.id)
569 # Cannot import to different run with the same ID
570 ref = DatasetRef(datasetTypeBias, dataIdBias1, id=ref1.id, run=f"run{run+1}")
571 with self.assertRaises(ConflictingDefinitionError):
572 registry._importDatasets([ref])
574 ref = DatasetRef(datasetTypeBias, dataIdBias1, id=0, run=f"run{run+1}")
575 if idGenMode is DatasetIdGenEnum.DATAID_TYPE:
576 # Cannot import same DATAID_TYPE ref into a new run
577 with self.assertRaises(ConflictingDefinitionError):
578 (ref2,) = registry._importDatasets([ref], idGenerationMode=idGenMode)
579 else:
580 # DATAID_TYPE_RUN ref can be imported into a new run
581 (ref2,) = registry._importDatasets([ref], idGenerationMode=idGenMode)
583 def testDatasetTypeComponentQueries(self):
584 """Test component options when querying for dataset types.
586 All of the behavior here is deprecated, so many of these tests are
587 currently wrapped in a context to check that we get a warning whenever
588 a component dataset is actually returned.
589 """
590 registry = self.makeRegistry()
591 self.loadData(registry, "base.yaml")
592 self.loadData(registry, "datasets.yaml")
593 # Test querying for dataset types with different inputs.
594 # First query for all dataset types; components should only be included
595 # when components=True.
596 self.assertEqual({"bias", "flat"}, NamedValueSet(registry.queryDatasetTypes()).names)
597 self.assertEqual({"bias", "flat"}, NamedValueSet(registry.queryDatasetTypes(components=False)).names)
598 with self.assertWarns(FutureWarning):
599 self.assertLess(
600 {"bias", "flat", "bias.wcs", "flat.photoCalib"},
601 NamedValueSet(registry.queryDatasetTypes(components=True)).names,
602 )
603 # Use a pattern that can match either parent or components. Again,
604 # components are only returned if components=True.
605 self.assertEqual({"bias"}, NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"))).names)
606 self.assertEqual(
607 {"bias"}, NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"), components=False)).names
608 )
609 with self.assertWarns(FutureWarning):
610 self.assertLess(
611 {"bias", "bias.wcs"},
612 NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"), components=True)).names,
613 )
614 # This pattern matches only a component. In this case we also return
615 # that component dataset type if components=None.
616 with self.assertWarns(FutureWarning):
617 self.assertEqual(
618 {"bias.wcs"}, NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"))).names
619 )
620 self.assertEqual(
621 set(),
622 NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"), components=False)).names,
623 )
624 with self.assertWarns(FutureWarning):
625 self.assertEqual(
626 {"bias.wcs"},
627 NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"), components=True)).names,
628 )
629 # Add a dataset type using a StorageClass that we'll then remove; check
630 # that this does not affect our ability to query for dataset types
631 # (though it will warn).
632 tempStorageClass = StorageClass(
633 name="TempStorageClass",
634 components={
635 "data1": registry.storageClasses.getStorageClass("StructuredDataDict"),
636 "data2": registry.storageClasses.getStorageClass("StructuredDataDict"),
637 },
638 )
639 registry.storageClasses.registerStorageClass(tempStorageClass)
640 datasetType = DatasetType(
641 "temporary",
642 dimensions=["instrument"],
643 storageClass=tempStorageClass,
644 universe=registry.dimensions,
645 )
646 registry.registerDatasetType(datasetType)
647 registry.storageClasses._unregisterStorageClass(tempStorageClass.name)
648 datasetType._storageClass = None
649 del tempStorageClass
650 # Querying for all dataset types, including components, should include
651 # at least all non-component dataset types (and I don't want to
652 # enumerate all of the Exposure components for bias and flat here).
653 with self.assertWarns(FutureWarning):
654 with self.assertLogs("lsst.daf.butler.registry", logging.WARN) as cm:
655 everything = NamedValueSet(registry.queryDatasetTypes(components=True))
656 self.assertIn("TempStorageClass", cm.output[0])
657 self.assertLess({"bias", "flat", "temporary"}, everything.names)
658 # It should not include "temporary.columns", because we tried to remove
659 # the storage class that would tell it about that. So if the next line
660 # fails (i.e. "temporary.columns" _is_ in everything.names), it means
661 # this part of the test isn't doing anything, because the _unregister
662 # call about isn't simulating the real-life case we want it to
663 # simulate, in which different versions of daf_butler in entirely
664 # different Python processes interact with the same repo.
665 self.assertNotIn("temporary.data", everything.names)
666 # Query for dataset types that start with "temp". This should again
667 # not include the component, and also not fail.
668 with self.assertLogs("lsst.daf.butler.registry", logging.WARN) as cm:
669 startsWithTemp = NamedValueSet(registry.queryDatasetTypes(re.compile("temp.*"), components=True))
670 self.assertIn("TempStorageClass", cm.output[0])
671 self.assertEqual({"temporary"}, startsWithTemp.names)
672 # Querying with no components should not warn at all.
673 with self.assertLogs("lsst.daf.butler.registries", logging.WARN) as cm:
674 startsWithTemp = NamedValueSet(registry.queryDatasetTypes(re.compile("temp.*"), components=False))
675 # Must issue a warning of our own to be captured.
676 logging.getLogger("lsst.daf.butler.registries").warning("test message")
677 self.assertEqual(len(cm.output), 1)
678 self.assertIn("test message", cm.output[0])
680 def testComponentLookups(self):
681 """Test searching for component datasets via their parents.
683 All of the behavior here is deprecated, so many of these tests are
684 currently wrapped in a context to check that we get a warning whenever
685 a component dataset is actually returned.
686 """
687 registry = self.makeRegistry()
688 self.loadData(registry, "base.yaml")
689 self.loadData(registry, "datasets.yaml")
690 # Test getting the child dataset type (which does still exist in the
691 # Registry), and check for consistency with
692 # DatasetRef.makeComponentRef.
693 collection = "imported_g"
694 parentType = registry.getDatasetType("bias")
695 childType = registry.getDatasetType("bias.wcs")
696 parentRefResolved = registry.findDataset(
697 parentType, collections=collection, instrument="Cam1", detector=1
698 )
699 self.assertIsInstance(parentRefResolved, DatasetRef)
700 self.assertEqual(childType, parentRefResolved.makeComponentRef("wcs").datasetType)
701 # Search for a single dataset with findDataset.
702 childRef1 = registry.findDataset("bias.wcs", collections=collection, dataId=parentRefResolved.dataId)
703 self.assertEqual(childRef1, parentRefResolved.makeComponentRef("wcs"))
704 # Search for detector data IDs constrained by component dataset
705 # existence with queryDataIds.
706 with self.assertWarns(FutureWarning):
707 dataIds = registry.queryDataIds(
708 ["detector"],
709 datasets=["bias.wcs"],
710 collections=collection,
711 ).toSet()
712 self.assertEqual(
713 dataIds,
714 DataCoordinateSet(
715 {
716 DataCoordinate.standardize(instrument="Cam1", detector=d, graph=parentType.dimensions)
717 for d in (1, 2, 3)
718 },
719 parentType.dimensions,
720 ),
721 )
722 # Search for multiple datasets of a single type with queryDatasets.
723 with self.assertWarns(FutureWarning):
724 childRefs2 = set(
725 registry.queryDatasets(
726 "bias.wcs",
727 collections=collection,
728 )
729 )
730 self.assertEqual(
731 {ref.unresolved() for ref in childRefs2}, {DatasetRef(childType, dataId) for dataId in dataIds}
732 )
734 def testCollections(self):
735 """Tests for registry methods that manage collections."""
736 registry = self.makeRegistry()
737 other_registry = self.makeRegistry(share_repo_with=registry)
738 self.loadData(registry, "base.yaml")
739 self.loadData(registry, "datasets.yaml")
740 run1 = "imported_g"
741 run2 = "imported_r"
742 # Test setting a collection docstring after it has been created.
743 registry.setCollectionDocumentation(run1, "doc for run1")
744 self.assertEqual(registry.getCollectionDocumentation(run1), "doc for run1")
745 registry.setCollectionDocumentation(run1, None)
746 self.assertIsNone(registry.getCollectionDocumentation(run1))
747 datasetType = "bias"
748 # Find some datasets via their run's collection.
749 dataId1 = {"instrument": "Cam1", "detector": 1}
750 ref1 = registry.findDataset(datasetType, dataId1, collections=run1)
751 self.assertIsNotNone(ref1)
752 dataId2 = {"instrument": "Cam1", "detector": 2}
753 ref2 = registry.findDataset(datasetType, dataId2, collections=run1)
754 self.assertIsNotNone(ref2)
755 # Associate those into a new collection, then look for them there.
756 tag1 = "tag1"
757 registry.registerCollection(tag1, type=CollectionType.TAGGED, doc="doc for tag1")
758 # Check that we can query for old and new collections by type.
759 self.assertEqual(set(registry.queryCollections(collectionTypes=CollectionType.RUN)), {run1, run2})
760 self.assertEqual(
761 set(registry.queryCollections(collectionTypes={CollectionType.TAGGED, CollectionType.RUN})),
762 {tag1, run1, run2},
763 )
764 self.assertEqual(registry.getCollectionDocumentation(tag1), "doc for tag1")
765 registry.associate(tag1, [ref1, ref2])
766 self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
767 self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
768 # Disassociate one and verify that we can't it there anymore...
769 registry.disassociate(tag1, [ref1])
770 self.assertIsNone(registry.findDataset(datasetType, dataId1, collections=tag1))
771 # ...but we can still find ref2 in tag1, and ref1 in the run.
772 self.assertEqual(registry.findDataset(datasetType, dataId1, collections=run1), ref1)
773 self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
774 collections = set(registry.queryCollections())
775 self.assertEqual(collections, {run1, run2, tag1})
776 # Associate both refs into tag1 again; ref2 is already there, but that
777 # should be a harmless no-op.
778 registry.associate(tag1, [ref1, ref2])
779 self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
780 self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
781 # Get a different dataset (from a different run) that has the same
782 # dataset type and data ID as ref2.
783 ref2b = registry.findDataset(datasetType, dataId2, collections=run2)
784 self.assertNotEqual(ref2, ref2b)
785 # Attempting to associate that into tag1 should be an error.
786 with self.assertRaises(ConflictingDefinitionError):
787 registry.associate(tag1, [ref2b])
788 # That error shouldn't have messed up what we had before.
789 self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
790 self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
791 # Attempt to associate the conflicting dataset again, this time with
792 # a dataset that isn't in the collection and won't cause a conflict.
793 # Should also fail without modifying anything.
794 dataId3 = {"instrument": "Cam1", "detector": 3}
795 ref3 = registry.findDataset(datasetType, dataId3, collections=run1)
796 with self.assertRaises(ConflictingDefinitionError):
797 registry.associate(tag1, [ref3, ref2b])
798 self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
799 self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
800 self.assertIsNone(registry.findDataset(datasetType, dataId3, collections=tag1))
801 # Register a chained collection that searches [tag1, run2]
802 chain1 = "chain1"
803 registry.registerCollection(chain1, type=CollectionType.CHAINED)
804 self.assertIs(registry.getCollectionType(chain1), CollectionType.CHAINED)
805 # Chained collection exists, but has no collections in it.
806 self.assertFalse(registry.getCollectionChain(chain1))
807 # If we query for all collections, we should get the chained collection
808 # only if we don't ask to flatten it (i.e. yield only its children).
809 self.assertEqual(set(registry.queryCollections(flattenChains=False)), {tag1, run1, run2, chain1})
810 self.assertEqual(set(registry.queryCollections(flattenChains=True)), {tag1, run1, run2})
811 # Attempt to set its child collections to something circular; that
812 # should fail.
813 with self.assertRaises(ValueError):
814 registry.setCollectionChain(chain1, [tag1, chain1])
815 # Add the child collections.
816 registry.setCollectionChain(chain1, [tag1, run2])
817 self.assertEqual(list(registry.getCollectionChain(chain1)), [tag1, run2])
818 self.assertEqual(registry.getCollectionParentChains(tag1), {chain1})
819 self.assertEqual(registry.getCollectionParentChains(run2), {chain1})
820 # Refresh the other registry that points to the same repo, and make
821 # sure it can see the things we've done (note that this does require
822 # an explicit refresh(); that's the documented behavior, because
823 # caching is ~impossible otherwise).
824 if other_registry is not None:
825 other_registry.refresh()
826 self.assertEqual(list(other_registry.getCollectionChain(chain1)), [tag1, run2])
827 self.assertEqual(other_registry.getCollectionParentChains(tag1), {chain1})
828 self.assertEqual(other_registry.getCollectionParentChains(run2), {chain1})
829 # Searching for dataId1 or dataId2 in the chain should return ref1 and
830 # ref2, because both are in tag1.
831 self.assertEqual(registry.findDataset(datasetType, dataId1, collections=chain1), ref1)
832 self.assertEqual(registry.findDataset(datasetType, dataId2, collections=chain1), ref2)
833 # Now disassociate ref2 from tag1. The search (for bias) with
834 # dataId2 in chain1 should then:
835 # 1. not find it in tag1
836 # 2. find a different dataset in run2
837 registry.disassociate(tag1, [ref2])
838 ref2b = registry.findDataset(datasetType, dataId2, collections=chain1)
839 self.assertNotEqual(ref2b, ref2)
840 self.assertEqual(ref2b, registry.findDataset(datasetType, dataId2, collections=run2))
841 # Define a new chain so we can test recursive chains.
842 chain2 = "chain2"
843 registry.registerCollection(chain2, type=CollectionType.CHAINED)
844 registry.setCollectionChain(chain2, [run2, chain1])
845 self.assertEqual(registry.getCollectionParentChains(chain1), {chain2})
846 self.assertEqual(registry.getCollectionParentChains(run2), {chain1, chain2})
847 # Query for collections matching a regex.
848 self.assertCountEqual(
849 list(registry.queryCollections(re.compile("imported_."), flattenChains=False)),
850 ["imported_r", "imported_g"],
851 )
852 # Query for collections matching a regex or an explicit str.
853 self.assertCountEqual(
854 list(registry.queryCollections([re.compile("imported_."), "chain1"], flattenChains=False)),
855 ["imported_r", "imported_g", "chain1"],
856 )
857 # Search for bias with dataId1 should find it via tag1 in chain2,
858 # recursing, because is not in run1.
859 self.assertIsNone(registry.findDataset(datasetType, dataId1, collections=run2))
860 self.assertEqual(registry.findDataset(datasetType, dataId1, collections=chain2), ref1)
861 # Search for bias with dataId2 should find it in run2 (ref2b).
862 self.assertEqual(registry.findDataset(datasetType, dataId2, collections=chain2), ref2b)
863 # Search for a flat that is in run2. That should not be found
864 # at the front of chain2, because of the restriction to bias
865 # on run2 there, but it should be found in at the end of chain1.
866 dataId4 = {"instrument": "Cam1", "detector": 3, "physical_filter": "Cam1-R2"}
867 ref4 = registry.findDataset("flat", dataId4, collections=run2)
868 self.assertIsNotNone(ref4)
869 self.assertEqual(ref4, registry.findDataset("flat", dataId4, collections=chain2))
870 # Deleting a collection that's part of a CHAINED collection is not
871 # allowed, and is exception-safe.
872 with self.assertRaises(Exception):
873 registry.removeCollection(run2)
874 self.assertEqual(registry.getCollectionType(run2), CollectionType.RUN)
875 with self.assertRaises(Exception):
876 registry.removeCollection(chain1)
877 self.assertEqual(registry.getCollectionType(chain1), CollectionType.CHAINED)
878 # Actually remove chain2, test that it's gone by asking for its type.
879 registry.removeCollection(chain2)
880 with self.assertRaises(MissingCollectionError):
881 registry.getCollectionType(chain2)
882 # Actually remove run2 and chain1, which should work now.
883 registry.removeCollection(chain1)
884 registry.removeCollection(run2)
885 with self.assertRaises(MissingCollectionError):
886 registry.getCollectionType(run2)
887 with self.assertRaises(MissingCollectionError):
888 registry.getCollectionType(chain1)
889 # Remove tag1 as well, just to test that we can remove TAGGED
890 # collections.
891 registry.removeCollection(tag1)
892 with self.assertRaises(MissingCollectionError):
893 registry.getCollectionType(tag1)
895 def testCollectionChainFlatten(self):
896 """Test that Registry.setCollectionChain obeys its 'flatten' option."""
897 registry = self.makeRegistry()
898 registry.registerCollection("inner", CollectionType.CHAINED)
899 registry.registerCollection("innermost", CollectionType.RUN)
900 registry.setCollectionChain("inner", ["innermost"])
901 registry.registerCollection("outer", CollectionType.CHAINED)
902 registry.setCollectionChain("outer", ["inner"], flatten=False)
903 self.assertEqual(list(registry.getCollectionChain("outer")), ["inner"])
904 registry.setCollectionChain("outer", ["inner"], flatten=True)
905 self.assertEqual(list(registry.getCollectionChain("outer")), ["innermost"])
907 def testBasicTransaction(self):
908 """Test that all operations within a single transaction block are
909 rolled back if an exception propagates out of the block.
910 """
911 registry = self.makeRegistry()
912 storageClass = StorageClass("testDatasetType")
913 registry.storageClasses.registerStorageClass(storageClass)
914 with registry.transaction():
915 registry.insertDimensionData("instrument", {"name": "Cam1", "class_name": "A"})
916 with self.assertRaises(ValueError):
917 with registry.transaction():
918 registry.insertDimensionData("instrument", {"name": "Cam2"})
919 raise ValueError("Oops, something went wrong")
920 # Cam1 should exist
921 self.assertEqual(registry.expandDataId(instrument="Cam1").records["instrument"].class_name, "A")
922 # But Cam2 and Cam3 should both not exist
923 with self.assertRaises(DataIdValueError):
924 registry.expandDataId(instrument="Cam2")
925 with self.assertRaises(DataIdValueError):
926 registry.expandDataId(instrument="Cam3")
928 def testNestedTransaction(self):
929 """Test that operations within a transaction block are not rolled back
930 if an exception propagates out of an inner transaction block and is
931 then caught.
932 """
933 registry = self.makeRegistry()
934 dimension = registry.dimensions["instrument"]
935 dataId1 = {"instrument": "DummyCam"}
936 dataId2 = {"instrument": "DummyCam2"}
937 checkpointReached = False
938 with registry.transaction():
939 # This should be added and (ultimately) committed.
940 registry.insertDimensionData(dimension, dataId1)
941 with self.assertRaises(sqlalchemy.exc.IntegrityError):
942 with registry.transaction(savepoint=True):
943 # This does not conflict, and should succeed (but not
944 # be committed).
945 registry.insertDimensionData(dimension, dataId2)
946 checkpointReached = True
947 # This should conflict and raise, triggerring a rollback
948 # of the previous insertion within the same transaction
949 # context, but not the original insertion in the outer
950 # block.
951 registry.insertDimensionData(dimension, dataId1)
952 self.assertTrue(checkpointReached)
953 self.assertIsNotNone(registry.expandDataId(dataId1, graph=dimension.graph))
954 with self.assertRaises(DataIdValueError):
955 registry.expandDataId(dataId2, graph=dimension.graph)
957 def testInstrumentDimensions(self):
958 """Test queries involving only instrument dimensions, with no joins to
959 skymap."""
960 registry = self.makeRegistry()
962 # need a bunch of dimensions and datasets for test
963 registry.insertDimensionData(
964 "instrument", dict(name="DummyCam", visit_max=25, exposure_max=300, detector_max=6)
965 )
966 registry.insertDimensionData(
967 "physical_filter",
968 dict(instrument="DummyCam", name="dummy_r", band="r"),
969 dict(instrument="DummyCam", name="dummy_i", band="i"),
970 )
971 registry.insertDimensionData(
972 "detector", *[dict(instrument="DummyCam", id=i, full_name=str(i)) for i in range(1, 6)]
973 )
974 registry.insertDimensionData(
975 "visit_system",
976 dict(instrument="DummyCam", id=1, name="default"),
977 )
978 registry.insertDimensionData(
979 "visit",
980 dict(instrument="DummyCam", id=10, name="ten", physical_filter="dummy_i", visit_system=1),
981 dict(instrument="DummyCam", id=11, name="eleven", physical_filter="dummy_r", visit_system=1),
982 dict(instrument="DummyCam", id=20, name="twelve", physical_filter="dummy_r", visit_system=1),
983 )
984 for i in range(1, 6):
985 registry.insertDimensionData(
986 "visit_detector_region",
987 dict(instrument="DummyCam", visit=10, detector=i),
988 dict(instrument="DummyCam", visit=11, detector=i),
989 dict(instrument="DummyCam", visit=20, detector=i),
990 )
991 registry.insertDimensionData(
992 "exposure",
993 dict(instrument="DummyCam", id=100, obs_id="100", physical_filter="dummy_i"),
994 dict(instrument="DummyCam", id=101, obs_id="101", physical_filter="dummy_i"),
995 dict(instrument="DummyCam", id=110, obs_id="110", physical_filter="dummy_r"),
996 dict(instrument="DummyCam", id=111, obs_id="111", physical_filter="dummy_r"),
997 dict(instrument="DummyCam", id=200, obs_id="200", physical_filter="dummy_r"),
998 dict(instrument="DummyCam", id=201, obs_id="201", physical_filter="dummy_r"),
999 )
1000 registry.insertDimensionData(
1001 "visit_definition",
1002 dict(instrument="DummyCam", exposure=100, visit_system=1, visit=10),
1003 dict(instrument="DummyCam", exposure=101, visit_system=1, visit=10),
1004 dict(instrument="DummyCam", exposure=110, visit_system=1, visit=11),
1005 dict(instrument="DummyCam", exposure=111, visit_system=1, visit=11),
1006 dict(instrument="DummyCam", exposure=200, visit_system=1, visit=20),
1007 dict(instrument="DummyCam", exposure=201, visit_system=1, visit=20),
1008 )
1009 # dataset types
1010 run1 = "test1_r"
1011 run2 = "test2_r"
1012 tagged2 = "test2_t"
1013 registry.registerRun(run1)
1014 registry.registerRun(run2)
1015 registry.registerCollection(tagged2)
1016 storageClass = StorageClass("testDataset")
1017 registry.storageClasses.registerStorageClass(storageClass)
1018 rawType = DatasetType(
1019 name="RAW",
1020 dimensions=registry.dimensions.extract(("instrument", "exposure", "detector")),
1021 storageClass=storageClass,
1022 )
1023 registry.registerDatasetType(rawType)
1024 calexpType = DatasetType(
1025 name="CALEXP",
1026 dimensions=registry.dimensions.extract(("instrument", "visit", "detector")),
1027 storageClass=storageClass,
1028 )
1029 registry.registerDatasetType(calexpType)
1031 # add pre-existing datasets
1032 for exposure in (100, 101, 110, 111):
1033 for detector in (1, 2, 3):
1034 # note that only 3 of 5 detectors have datasets
1035 dataId = dict(instrument="DummyCam", exposure=exposure, detector=detector)
1036 (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run1)
1037 # exposures 100 and 101 appear in both run1 and tagged2.
1038 # 100 has different datasets in the different collections
1039 # 101 has the same dataset in both collections.
1040 if exposure == 100:
1041 (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run2)
1042 if exposure in (100, 101):
1043 registry.associate(tagged2, [ref])
1044 # Add pre-existing datasets to tagged2.
1045 for exposure in (200, 201):
1046 for detector in (3, 4, 5):
1047 # note that only 3 of 5 detectors have datasets
1048 dataId = dict(instrument="DummyCam", exposure=exposure, detector=detector)
1049 (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run2)
1050 registry.associate(tagged2, [ref])
1052 dimensions = DimensionGraph(
1053 registry.dimensions, dimensions=(rawType.dimensions.required | calexpType.dimensions.required)
1054 )
1055 # Test that single dim string works as well as list of str
1056 rows = registry.queryDataIds("visit", datasets=rawType, collections=run1).expanded().toSet()
1057 rowsI = registry.queryDataIds(["visit"], datasets=rawType, collections=run1).expanded().toSet()
1058 self.assertEqual(rows, rowsI)
1059 # with empty expression
1060 rows = registry.queryDataIds(dimensions, datasets=rawType, collections=run1).expanded().toSet()
1061 self.assertEqual(len(rows), 4 * 3) # 4 exposures times 3 detectors
1062 self.assertCountEqual(set(dataId["exposure"] for dataId in rows), (100, 101, 110, 111))
1063 self.assertCountEqual(set(dataId["visit"] for dataId in rows), (10, 11))
1064 self.assertCountEqual(set(dataId["detector"] for dataId in rows), (1, 2, 3))
1066 # second collection
1067 rows = registry.queryDataIds(dimensions, datasets=rawType, collections=tagged2).toSet()
1068 self.assertEqual(len(rows), 4 * 3) # 4 exposures times 3 detectors
1069 for dataId in rows:
1070 self.assertCountEqual(dataId.keys(), ("instrument", "detector", "exposure", "visit"))
1071 self.assertCountEqual(set(dataId["exposure"] for dataId in rows), (100, 101, 200, 201))
1072 self.assertCountEqual(set(dataId["visit"] for dataId in rows), (10, 20))
1073 self.assertCountEqual(set(dataId["detector"] for dataId in rows), (1, 2, 3, 4, 5))
1075 # with two input datasets
1076 rows = registry.queryDataIds(dimensions, datasets=rawType, collections=[run1, tagged2]).toSet()
1077 self.assertEqual(len(set(rows)), 6 * 3) # 6 exposures times 3 detectors; set needed to de-dupe
1078 for dataId in rows:
1079 self.assertCountEqual(dataId.keys(), ("instrument", "detector", "exposure", "visit"))
1080 self.assertCountEqual(set(dataId["exposure"] for dataId in rows), (100, 101, 110, 111, 200, 201))
1081 self.assertCountEqual(set(dataId["visit"] for dataId in rows), (10, 11, 20))
1082 self.assertCountEqual(set(dataId["detector"] for dataId in rows), (1, 2, 3, 4, 5))
1084 # limit to single visit
1085 rows = registry.queryDataIds(
1086 dimensions, datasets=rawType, collections=run1, where="visit = 10", instrument="DummyCam"
1087 ).toSet()
1088 self.assertEqual(len(rows), 2 * 3) # 2 exposures times 3 detectors
1089 self.assertCountEqual(set(dataId["exposure"] for dataId in rows), (100, 101))
1090 self.assertCountEqual(set(dataId["visit"] for dataId in rows), (10,))
1091 self.assertCountEqual(set(dataId["detector"] for dataId in rows), (1, 2, 3))
1093 # more limiting expression, using link names instead of Table.column
1094 rows = registry.queryDataIds(
1095 dimensions,
1096 datasets=rawType,
1097 collections=run1,
1098 where="visit = 10 and detector > 1 and 'DummyCam'=instrument",
1099 ).toSet()
1100 self.assertEqual(len(rows), 2 * 2) # 2 exposures times 2 detectors
1101 self.assertCountEqual(set(dataId["exposure"] for dataId in rows), (100, 101))
1102 self.assertCountEqual(set(dataId["visit"] for dataId in rows), (10,))
1103 self.assertCountEqual(set(dataId["detector"] for dataId in rows), (2, 3))
1105 # queryDataIds with only one of `datasets` and `collections` is an
1106 # error.
1107 with self.assertRaises(CollectionError):
1108 registry.queryDataIds(dimensions, datasets=rawType)
1109 with self.assertRaises(ArgumentError):
1110 registry.queryDataIds(dimensions, collections=run1)
1112 # expression excludes everything
1113 rows = registry.queryDataIds(
1114 dimensions, datasets=rawType, collections=run1, where="visit > 1000", instrument="DummyCam"
1115 ).toSet()
1116 self.assertEqual(len(rows), 0)
1118 # Selecting by physical_filter, this is not in the dimensions, but it
1119 # is a part of the full expression so it should work too.
1120 rows = registry.queryDataIds(
1121 dimensions,
1122 datasets=rawType,
1123 collections=run1,
1124 where="physical_filter = 'dummy_r'",
1125 instrument="DummyCam",
1126 ).toSet()
1127 self.assertEqual(len(rows), 2 * 3) # 2 exposures times 3 detectors
1128 self.assertCountEqual(set(dataId["exposure"] for dataId in rows), (110, 111))
1129 self.assertCountEqual(set(dataId["visit"] for dataId in rows), (11,))
1130 self.assertCountEqual(set(dataId["detector"] for dataId in rows), (1, 2, 3))
1132 def testSkyMapDimensions(self):
1133 """Tests involving only skymap dimensions, no joins to instrument."""
1134 registry = self.makeRegistry()
1136 # need a bunch of dimensions and datasets for test, we want
1137 # "band" in the test so also have to add physical_filter
1138 # dimensions
1139 registry.insertDimensionData("instrument", dict(instrument="DummyCam"))
1140 registry.insertDimensionData(
1141 "physical_filter",
1142 dict(instrument="DummyCam", name="dummy_r", band="r"),
1143 dict(instrument="DummyCam", name="dummy_i", band="i"),
1144 )
1145 registry.insertDimensionData("skymap", dict(name="DummyMap", hash="sha!".encode("utf8")))
1146 for tract in range(10):
1147 registry.insertDimensionData("tract", dict(skymap="DummyMap", id=tract))
1148 registry.insertDimensionData(
1149 "patch",
1150 *[dict(skymap="DummyMap", tract=tract, id=patch, cell_x=0, cell_y=0) for patch in range(10)],
1151 )
1153 # dataset types
1154 run = "tésτ"
1155 registry.registerRun(run)
1156 storageClass = StorageClass("testDataset")
1157 registry.storageClasses.registerStorageClass(storageClass)
1158 calexpType = DatasetType(
1159 name="deepCoadd_calexp",
1160 dimensions=registry.dimensions.extract(("skymap", "tract", "patch", "band")),
1161 storageClass=storageClass,
1162 )
1163 registry.registerDatasetType(calexpType)
1164 mergeType = DatasetType(
1165 name="deepCoadd_mergeDet",
1166 dimensions=registry.dimensions.extract(("skymap", "tract", "patch")),
1167 storageClass=storageClass,
1168 )
1169 registry.registerDatasetType(mergeType)
1170 measType = DatasetType(
1171 name="deepCoadd_meas",
1172 dimensions=registry.dimensions.extract(("skymap", "tract", "patch", "band")),
1173 storageClass=storageClass,
1174 )
1175 registry.registerDatasetType(measType)
1177 dimensions = DimensionGraph(
1178 registry.dimensions,
1179 dimensions=(
1180 calexpType.dimensions.required | mergeType.dimensions.required | measType.dimensions.required
1181 ),
1182 )
1184 # add pre-existing datasets
1185 for tract in (1, 3, 5):
1186 for patch in (2, 4, 6, 7):
1187 dataId = dict(skymap="DummyMap", tract=tract, patch=patch)
1188 registry.insertDatasets(mergeType, dataIds=[dataId], run=run)
1189 for aFilter in ("i", "r"):
1190 dataId = dict(skymap="DummyMap", tract=tract, patch=patch, band=aFilter)
1191 registry.insertDatasets(calexpType, dataIds=[dataId], run=run)
1193 # with empty expression
1194 rows = registry.queryDataIds(dimensions, datasets=[calexpType, mergeType], collections=run).toSet()
1195 self.assertEqual(len(rows), 3 * 4 * 2) # 4 tracts x 4 patches x 2 filters
1196 for dataId in rows:
1197 self.assertCountEqual(dataId.keys(), ("skymap", "tract", "patch", "band"))
1198 self.assertCountEqual(set(dataId["tract"] for dataId in rows), (1, 3, 5))
1199 self.assertCountEqual(set(dataId["patch"] for dataId in rows), (2, 4, 6, 7))
1200 self.assertCountEqual(set(dataId["band"] for dataId in rows), ("i", "r"))
1202 # limit to 2 tracts and 2 patches
1203 rows = registry.queryDataIds(
1204 dimensions,
1205 datasets=[calexpType, mergeType],
1206 collections=run,
1207 where="tract IN (1, 5) AND patch IN (2, 7)",
1208 skymap="DummyMap",
1209 ).toSet()
1210 self.assertEqual(len(rows), 2 * 2 * 2) # 2 tracts x 2 patches x 2 filters
1211 self.assertCountEqual(set(dataId["tract"] for dataId in rows), (1, 5))
1212 self.assertCountEqual(set(dataId["patch"] for dataId in rows), (2, 7))
1213 self.assertCountEqual(set(dataId["band"] for dataId in rows), ("i", "r"))
1215 # limit to single filter
1216 rows = registry.queryDataIds(
1217 dimensions, datasets=[calexpType, mergeType], collections=run, where="band = 'i'"
1218 ).toSet()
1219 self.assertEqual(len(rows), 3 * 4 * 1) # 4 tracts x 4 patches x 2 filters
1220 self.assertCountEqual(set(dataId["tract"] for dataId in rows), (1, 3, 5))
1221 self.assertCountEqual(set(dataId["patch"] for dataId in rows), (2, 4, 6, 7))
1222 self.assertCountEqual(set(dataId["band"] for dataId in rows), ("i",))
1224 # Specifying non-existing skymap is an exception
1225 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
1226 rows = registry.queryDataIds(
1227 dimensions, datasets=[calexpType, mergeType], collections=run, where="skymap = 'Mars'"
1228 ).toSet()
1230 def testSpatialJoin(self):
1231 """Test queries that involve spatial overlap joins."""
1232 registry = self.makeRegistry()
1233 self.loadData(registry, "hsc-rc2-subset.yaml")
1235 # Dictionary of spatial DatabaseDimensionElements, keyed by the name of
1236 # the TopologicalFamily they belong to. We'll relate all elements in
1237 # each family to all of the elements in each other family.
1238 families = defaultdict(set)
1239 # Dictionary of {element.name: {dataId: region}}.
1240 regions = {}
1241 for element in registry.dimensions.getDatabaseElements():
1242 if element.spatial is not None:
1243 families[element.spatial.name].add(element)
1244 regions[element.name] = {
1245 record.dataId: record.region for record in registry.queryDimensionRecords(element)
1246 }
1248 # If this check fails, it's not necessarily a problem - it may just be
1249 # a reasonable change to the default dimension definitions - but the
1250 # test below depends on there being more than one family to do anything
1251 # useful.
1252 self.assertEqual(len(families), 2)
1254 # Overlap DatabaseDimensionElements with each other.
1255 for family1, family2 in itertools.combinations(families, 2):
1256 for element1, element2 in itertools.product(families[family1], families[family2]):
1257 graph = DimensionGraph.union(element1.graph, element2.graph)
1258 # Construct expected set of overlapping data IDs via a
1259 # brute-force comparison of the regions we've already fetched.
1260 expected = {
1261 DataCoordinate.standardize({**dataId1.byName(), **dataId2.byName()}, graph=graph)
1262 for (dataId1, region1), (dataId2, region2) in itertools.product(
1263 regions[element1.name].items(), regions[element2.name].items()
1264 )
1265 if not region1.isDisjointFrom(region2)
1266 }
1267 self.assertGreater(len(expected), 2, msg="Test that we aren't just comparing empty sets.")
1268 queried = set(registry.queryDataIds(graph))
1269 self.assertEqual(expected, queried)
1271 # Overlap each DatabaseDimensionElement with the commonSkyPix system.
1272 commonSkyPix = registry.dimensions.commonSkyPix
1273 for elementName, regions in regions.items():
1274 graph = DimensionGraph.union(registry.dimensions[elementName].graph, commonSkyPix.graph)
1275 expected = set()
1276 for dataId, region in regions.items():
1277 for begin, end in commonSkyPix.pixelization.envelope(region):
1278 expected.update(
1279 DataCoordinate.standardize({commonSkyPix.name: index, **dataId.byName()}, graph=graph)
1280 for index in range(begin, end)
1281 )
1282 self.assertGreater(len(expected), 2, msg="Test that we aren't just comparing empty sets.")
1283 queried = set(registry.queryDataIds(graph))
1284 self.assertEqual(expected, queried)
1286 def testAbstractQuery(self):
1287 """Test that we can run a query that just lists the known
1288 bands. This is tricky because band is
1289 backed by a query against physical_filter.
1290 """
1291 registry = self.makeRegistry()
1292 registry.insertDimensionData("instrument", dict(name="DummyCam"))
1293 registry.insertDimensionData(
1294 "physical_filter",
1295 dict(instrument="DummyCam", name="dummy_i", band="i"),
1296 dict(instrument="DummyCam", name="dummy_i2", band="i"),
1297 dict(instrument="DummyCam", name="dummy_r", band="r"),
1298 )
1299 rows = registry.queryDataIds(["band"]).toSet()
1300 self.assertCountEqual(
1301 rows,
1302 [
1303 DataCoordinate.standardize(band="i", universe=registry.dimensions),
1304 DataCoordinate.standardize(band="r", universe=registry.dimensions),
1305 ],
1306 )
1308 def testAttributeManager(self):
1309 """Test basic functionality of attribute manager."""
1310 # number of attributes with schema versions in a fresh database,
1311 # 6 managers with 2 records per manager, plus config for dimensions
1312 VERSION_COUNT = 6 * 2 + 1
1314 registry = self.makeRegistry()
1315 attributes = registry._managers.attributes
1317 # check what get() returns for non-existing key
1318 self.assertIsNone(attributes.get("attr"))
1319 self.assertEqual(attributes.get("attr", ""), "")
1320 self.assertEqual(attributes.get("attr", "Value"), "Value")
1321 self.assertEqual(len(list(attributes.items())), VERSION_COUNT)
1323 # cannot store empty key or value
1324 with self.assertRaises(ValueError):
1325 attributes.set("", "value")
1326 with self.assertRaises(ValueError):
1327 attributes.set("attr", "")
1329 # set value of non-existing key
1330 attributes.set("attr", "value")
1331 self.assertEqual(len(list(attributes.items())), VERSION_COUNT + 1)
1332 self.assertEqual(attributes.get("attr"), "value")
1334 # update value of existing key
1335 with self.assertRaises(ButlerAttributeExistsError):
1336 attributes.set("attr", "value2")
1338 attributes.set("attr", "value2", force=True)
1339 self.assertEqual(len(list(attributes.items())), VERSION_COUNT + 1)
1340 self.assertEqual(attributes.get("attr"), "value2")
1342 # delete existing key
1343 self.assertTrue(attributes.delete("attr"))
1344 self.assertEqual(len(list(attributes.items())), VERSION_COUNT)
1346 # delete non-existing key
1347 self.assertFalse(attributes.delete("non-attr"))
1349 # store bunch of keys and get the list back
1350 data = [
1351 ("version.core", "1.2.3"),
1352 ("version.dimensions", "3.2.1"),
1353 ("config.managers.opaque", "ByNameOpaqueTableStorageManager"),
1354 ]
1355 for key, value in data:
1356 attributes.set(key, value)
1357 items = dict(attributes.items())
1358 for key, value in data:
1359 self.assertEqual(items[key], value)
1361 def testQueryDatasetsDeduplication(self):
1362 """Test that the findFirst option to queryDatasets selects datasets
1363 from collections in the order given".
1364 """
1365 registry = self.makeRegistry()
1366 self.loadData(registry, "base.yaml")
1367 self.loadData(registry, "datasets.yaml")
1368 self.assertCountEqual(
1369 list(registry.queryDatasets("bias", collections=["imported_g", "imported_r"])),
1370 [
1371 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1372 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g"),
1373 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g"),
1374 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r"),
1375 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r"),
1376 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1377 ],
1378 )
1379 self.assertCountEqual(
1380 list(registry.queryDatasets("bias", collections=["imported_g", "imported_r"], findFirst=True)),
1381 [
1382 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1383 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g"),
1384 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g"),
1385 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1386 ],
1387 )
1388 self.assertCountEqual(
1389 list(registry.queryDatasets("bias", collections=["imported_r", "imported_g"], findFirst=True)),
1390 [
1391 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1392 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r"),
1393 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r"),
1394 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1395 ],
1396 )
1398 def testQueryResults(self):
1399 """Test querying for data IDs and then manipulating the QueryResults
1400 object returned to perform other queries.
1401 """
1402 registry = self.makeRegistry()
1403 self.loadData(registry, "base.yaml")
1404 self.loadData(registry, "datasets.yaml")
1405 bias = registry.getDatasetType("bias")
1406 flat = registry.getDatasetType("flat")
1407 # Obtain expected results from methods other than those we're testing
1408 # here. That includes:
1409 # - the dimensions of the data IDs we want to query:
1410 expectedGraph = DimensionGraph(registry.dimensions, names=["detector", "physical_filter"])
1411 # - the dimensions of some other data IDs we'll extract from that:
1412 expectedSubsetGraph = DimensionGraph(registry.dimensions, names=["detector"])
1413 # - the data IDs we expect to obtain from the first queries:
1414 expectedDataIds = DataCoordinateSet(
1415 {
1416 DataCoordinate.standardize(
1417 instrument="Cam1", detector=d, physical_filter=p, universe=registry.dimensions
1418 )
1419 for d, p in itertools.product({1, 2, 3}, {"Cam1-G", "Cam1-R1", "Cam1-R2"})
1420 },
1421 graph=expectedGraph,
1422 hasFull=False,
1423 hasRecords=False,
1424 )
1425 # - the flat datasets we expect to find from those data IDs, in just
1426 # one collection (so deduplication is irrelevant):
1427 expectedFlats = [
1428 registry.findDataset(
1429 flat, instrument="Cam1", detector=1, physical_filter="Cam1-R1", collections="imported_r"
1430 ),
1431 registry.findDataset(
1432 flat, instrument="Cam1", detector=2, physical_filter="Cam1-R1", collections="imported_r"
1433 ),
1434 registry.findDataset(
1435 flat, instrument="Cam1", detector=3, physical_filter="Cam1-R2", collections="imported_r"
1436 ),
1437 ]
1438 # - the data IDs we expect to extract from that:
1439 expectedSubsetDataIds = expectedDataIds.subset(expectedSubsetGraph)
1440 # - the bias datasets we expect to find from those data IDs, after we
1441 # subset-out the physical_filter dimension, both with duplicates:
1442 expectedAllBiases = [
1443 registry.findDataset(bias, instrument="Cam1", detector=1, collections="imported_g"),
1444 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_g"),
1445 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_g"),
1446 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_r"),
1447 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_r"),
1448 ]
1449 # - ...and without duplicates:
1450 expectedDeduplicatedBiases = [
1451 registry.findDataset(bias, instrument="Cam1", detector=1, collections="imported_g"),
1452 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_r"),
1453 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_r"),
1454 ]
1455 # Test against those expected results, using a "lazy" query for the
1456 # data IDs (which re-executes that query each time we use it to do
1457 # something new).
1458 dataIds = registry.queryDataIds(
1459 ["detector", "physical_filter"],
1460 where="detector.purpose = 'SCIENCE'", # this rejects detector=4
1461 instrument="Cam1",
1462 )
1463 self.assertEqual(dataIds.graph, expectedGraph)
1464 self.assertEqual(dataIds.toSet(), expectedDataIds)
1465 self.assertCountEqual(
1466 list(
1467 dataIds.findDatasets(
1468 flat,
1469 collections=["imported_r"],
1470 )
1471 ),
1472 expectedFlats,
1473 )
1474 subsetDataIds = dataIds.subset(expectedSubsetGraph, unique=True)
1475 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1476 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1477 self.assertCountEqual(
1478 list(subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=False)),
1479 expectedAllBiases,
1480 )
1481 self.assertCountEqual(
1482 list(subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)),
1483 expectedDeduplicatedBiases,
1484 )
1486 # Check dimensions match.
1487 with self.assertRaises(ValueError):
1488 subsetDataIds.findDatasets("flat", collections=["imported_r", "imported_g"], findFirst=True)
1490 # Use a component dataset type.
1491 self.assertCountEqual(
1492 [
1493 ref.makeComponentRef("image")
1494 for ref in subsetDataIds.findDatasets(
1495 bias,
1496 collections=["imported_r", "imported_g"],
1497 findFirst=False,
1498 )
1499 ],
1500 [ref.makeComponentRef("image") for ref in expectedAllBiases],
1501 )
1503 # Use a named dataset type that does not exist and a dataset type
1504 # object that does not exist.
1505 unknown_type = DatasetType("not_known", dimensions=bias.dimensions, storageClass="Exposure")
1507 # Test both string name and dataset type object.
1508 test_type: Union[str, DatasetType]
1509 for test_type, test_type_name in (
1510 (unknown_type, unknown_type.name),
1511 (unknown_type.name, unknown_type.name),
1512 ):
1513 with self.assertRaisesRegex(DatasetTypeError, expected_regex=test_type_name):
1514 list(
1515 subsetDataIds.findDatasets(
1516 test_type, collections=["imported_r", "imported_g"], findFirst=True
1517 )
1518 )
1520 # Materialize the bias dataset queries (only) by putting the results
1521 # into temporary tables, then repeat those tests.
1522 with subsetDataIds.findDatasets(
1523 bias, collections=["imported_r", "imported_g"], findFirst=False
1524 ).materialize() as biases:
1525 self.assertCountEqual(list(biases), expectedAllBiases)
1526 with subsetDataIds.findDatasets(
1527 bias, collections=["imported_r", "imported_g"], findFirst=True
1528 ).materialize() as biases:
1529 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1530 # Materialize the data ID subset query, but not the dataset queries.
1531 with subsetDataIds.materialize() as subsetDataIds:
1532 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1533 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1534 self.assertCountEqual(
1535 list(
1536 subsetDataIds.findDatasets(
1537 bias, collections=["imported_r", "imported_g"], findFirst=False
1538 )
1539 ),
1540 expectedAllBiases,
1541 )
1542 self.assertCountEqual(
1543 list(
1544 subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)
1545 ),
1546 expectedDeduplicatedBiases,
1547 )
1548 # Materialize the dataset queries, too.
1549 with subsetDataIds.findDatasets(
1550 bias, collections=["imported_r", "imported_g"], findFirst=False
1551 ).materialize() as biases:
1552 self.assertCountEqual(list(biases), expectedAllBiases)
1553 with subsetDataIds.findDatasets(
1554 bias, collections=["imported_r", "imported_g"], findFirst=True
1555 ).materialize() as biases:
1556 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1557 # Materialize the original query, but none of the follow-up queries.
1558 with dataIds.materialize() as dataIds:
1559 self.assertEqual(dataIds.graph, expectedGraph)
1560 self.assertEqual(dataIds.toSet(), expectedDataIds)
1561 self.assertCountEqual(
1562 list(
1563 dataIds.findDatasets(
1564 flat,
1565 collections=["imported_r"],
1566 )
1567 ),
1568 expectedFlats,
1569 )
1570 subsetDataIds = dataIds.subset(expectedSubsetGraph, unique=True)
1571 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1572 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1573 self.assertCountEqual(
1574 list(
1575 subsetDataIds.findDatasets(
1576 bias, collections=["imported_r", "imported_g"], findFirst=False
1577 )
1578 ),
1579 expectedAllBiases,
1580 )
1581 self.assertCountEqual(
1582 list(
1583 subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)
1584 ),
1585 expectedDeduplicatedBiases,
1586 )
1587 # Materialize just the bias dataset queries.
1588 with subsetDataIds.findDatasets(
1589 bias, collections=["imported_r", "imported_g"], findFirst=False
1590 ).materialize() as biases:
1591 self.assertCountEqual(list(biases), expectedAllBiases)
1592 with subsetDataIds.findDatasets(
1593 bias, collections=["imported_r", "imported_g"], findFirst=True
1594 ).materialize() as biases:
1595 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1596 # Materialize the subset data ID query, but not the dataset
1597 # queries.
1598 with subsetDataIds.materialize() as subsetDataIds:
1599 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1600 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1601 self.assertCountEqual(
1602 list(
1603 subsetDataIds.findDatasets(
1604 bias, collections=["imported_r", "imported_g"], findFirst=False
1605 )
1606 ),
1607 expectedAllBiases,
1608 )
1609 self.assertCountEqual(
1610 list(
1611 subsetDataIds.findDatasets(
1612 bias, collections=["imported_r", "imported_g"], findFirst=True
1613 )
1614 ),
1615 expectedDeduplicatedBiases,
1616 )
1617 # Materialize the bias dataset queries, too, so now we're
1618 # materializing every single step.
1619 with subsetDataIds.findDatasets(
1620 bias, collections=["imported_r", "imported_g"], findFirst=False
1621 ).materialize() as biases:
1622 self.assertCountEqual(list(biases), expectedAllBiases)
1623 with subsetDataIds.findDatasets(
1624 bias, collections=["imported_r", "imported_g"], findFirst=True
1625 ).materialize() as biases:
1626 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1628 def testStorageClassPropagation(self):
1629 """Test that queries for datasets respect the storage class passed in
1630 as part of a full dataset type.
1631 """
1632 registry = self.makeRegistry()
1633 self.loadData(registry, "base.yaml")
1634 dataset_type_in_registry = DatasetType(
1635 "tbl", dimensions=["instrument"], storageClass="DataFrame", universe=registry.dimensions
1636 )
1637 registry.registerDatasetType(dataset_type_in_registry)
1638 run = "run1"
1639 registry.registerRun(run)
1640 (inserted_ref,) = registry.insertDatasets(
1641 dataset_type_in_registry, [registry.expandDataId(instrument="Cam1")], run=run
1642 )
1643 self.assertEqual(inserted_ref.datasetType, dataset_type_in_registry)
1644 query_dataset_type = DatasetType(
1645 "tbl", dimensions=["instrument"], storageClass="ArrowAstropy", universe=registry.dimensions
1646 )
1647 self.assertNotEqual(dataset_type_in_registry, query_dataset_type)
1648 query_datasets_result = registry.queryDatasets(query_dataset_type, collections=[run])
1649 self.assertEqual(query_datasets_result.parentDatasetType, query_dataset_type) # type: ignore
1650 (query_datasets_ref,) = query_datasets_result
1651 self.assertEqual(query_datasets_ref.datasetType, query_dataset_type)
1652 query_data_ids_find_datasets_result = registry.queryDataIds(["instrument"]).findDatasets(
1653 query_dataset_type, collections=[run]
1654 )
1655 self.assertEqual(query_data_ids_find_datasets_result.parentDatasetType, query_dataset_type)
1656 (query_data_ids_find_datasets_ref,) = query_data_ids_find_datasets_result
1657 self.assertEqual(query_data_ids_find_datasets_ref.datasetType, query_dataset_type)
1658 query_dataset_types_result = registry.queryDatasetTypes(query_dataset_type)
1659 self.assertEqual(list(query_dataset_types_result), [query_dataset_type])
1660 find_dataset_ref = registry.findDataset(query_dataset_type, instrument="Cam1", collections=[run])
1661 self.assertEqual(find_dataset_ref.datasetType, query_dataset_type)
1663 def testEmptyDimensionsQueries(self):
1664 """Test Query and QueryResults objects in the case where there are no
1665 dimensions.
1666 """
1667 # Set up test data: one dataset type, two runs, one dataset in each.
1668 registry = self.makeRegistry()
1669 self.loadData(registry, "base.yaml")
1670 schema = DatasetType("schema", dimensions=registry.dimensions.empty, storageClass="Catalog")
1671 registry.registerDatasetType(schema)
1672 dataId = DataCoordinate.makeEmpty(registry.dimensions)
1673 run1 = "run1"
1674 run2 = "run2"
1675 registry.registerRun(run1)
1676 registry.registerRun(run2)
1677 (dataset1,) = registry.insertDatasets(schema, dataIds=[dataId], run=run1)
1678 (dataset2,) = registry.insertDatasets(schema, dataIds=[dataId], run=run2)
1679 # Query directly for both of the datasets, and each one, one at a time.
1680 self.checkQueryResults(
1681 registry.queryDatasets(schema, collections=[run1, run2], findFirst=False), [dataset1, dataset2]
1682 )
1683 self.checkQueryResults(
1684 registry.queryDatasets(schema, collections=[run1, run2], findFirst=True),
1685 [dataset1],
1686 )
1687 self.checkQueryResults(
1688 registry.queryDatasets(schema, collections=[run2, run1], findFirst=True),
1689 [dataset2],
1690 )
1691 # Query for data IDs with no dimensions.
1692 dataIds = registry.queryDataIds([])
1693 self.checkQueryResults(dataIds, [dataId])
1694 # Use queried data IDs to find the datasets.
1695 self.checkQueryResults(
1696 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1697 [dataset1, dataset2],
1698 )
1699 self.checkQueryResults(
1700 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1701 [dataset1],
1702 )
1703 self.checkQueryResults(
1704 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1705 [dataset2],
1706 )
1707 # Now materialize the data ID query results and repeat those tests.
1708 with dataIds.materialize() as dataIds:
1709 self.checkQueryResults(dataIds, [dataId])
1710 self.checkQueryResults(
1711 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1712 [dataset1],
1713 )
1714 self.checkQueryResults(
1715 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1716 [dataset2],
1717 )
1718 # Query for non-empty data IDs, then subset that to get the empty one.
1719 # Repeat the above tests starting from that.
1720 dataIds = registry.queryDataIds(["instrument"]).subset(registry.dimensions.empty, unique=True)
1721 self.checkQueryResults(dataIds, [dataId])
1722 self.checkQueryResults(
1723 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1724 [dataset1, dataset2],
1725 )
1726 self.checkQueryResults(
1727 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1728 [dataset1],
1729 )
1730 self.checkQueryResults(
1731 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1732 [dataset2],
1733 )
1734 with dataIds.materialize() as dataIds:
1735 self.checkQueryResults(dataIds, [dataId])
1736 self.checkQueryResults(
1737 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1738 [dataset1, dataset2],
1739 )
1740 self.checkQueryResults(
1741 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1742 [dataset1],
1743 )
1744 self.checkQueryResults(
1745 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1746 [dataset2],
1747 )
1748 # Query for non-empty data IDs, then materialize, then subset to get
1749 # the empty one. Repeat again.
1750 with registry.queryDataIds(["instrument"]).materialize() as nonEmptyDataIds:
1751 dataIds = nonEmptyDataIds.subset(registry.dimensions.empty, unique=True)
1752 self.checkQueryResults(dataIds, [dataId])
1753 self.checkQueryResults(
1754 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1755 [dataset1, dataset2],
1756 )
1757 self.checkQueryResults(
1758 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1759 [dataset1],
1760 )
1761 self.checkQueryResults(
1762 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1763 [dataset2],
1764 )
1765 with dataIds.materialize() as dataIds:
1766 self.checkQueryResults(dataIds, [dataId])
1767 self.checkQueryResults(
1768 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1769 [dataset1, dataset2],
1770 )
1771 self.checkQueryResults(
1772 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1773 [dataset1],
1774 )
1775 self.checkQueryResults(
1776 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1777 [dataset2],
1778 )
1779 # Query for non-empty data IDs with a constraint on an empty-data-ID
1780 # dataset that exists.
1781 dataIds = registry.queryDataIds(["instrument"], datasets="schema", collections=...)
1782 self.checkQueryResults(
1783 dataIds.subset(unique=True),
1784 [DataCoordinate.standardize(instrument="Cam1", universe=registry.dimensions)],
1785 )
1786 # Again query for non-empty data IDs with a constraint on empty-data-ID
1787 # datasets, but when the datasets don't exist. We delete the existing
1788 # dataset and query just that collection rather than creating a new
1789 # empty collection because this is a bit less likely for our build-time
1790 # logic to shortcut-out (via the collection summaries), and such a
1791 # shortcut would make this test a bit more trivial than we'd like.
1792 registry.removeDatasets([dataset2])
1793 dataIds = registry.queryDataIds(["instrument"], datasets="schema", collections=run2)
1794 self.checkQueryResults(dataIds, [])
1796 def testDimensionDataModifications(self):
1797 """Test that modifying dimension records via:
1798 syncDimensionData(..., update=True) and
1799 insertDimensionData(..., replace=True) works as expected, even in the
1800 presence of datasets using those dimensions and spatial overlap
1801 relationships.
1802 """
1804 def unpack_range_set(ranges: lsst.sphgeom.RangeSet) -> Iterator[int]:
1805 """Unpack a sphgeom.RangeSet into the integers it contains."""
1806 for begin, end in ranges:
1807 yield from range(begin, end)
1809 def range_set_hull(
1810 ranges: lsst.sphgeom.RangeSet,
1811 pixelization: lsst.sphgeom.HtmPixelization,
1812 ) -> lsst.sphgeom.ConvexPolygon:
1813 """Create a ConvexPolygon hull of the region defined by a set of
1814 HTM pixelization index ranges.
1815 """
1816 points = []
1817 for index in unpack_range_set(ranges):
1818 points.extend(pixelization.triangle(index).getVertices())
1819 return lsst.sphgeom.ConvexPolygon(points)
1821 # Use HTM to set up an initial parent region (one arbitrary trixel)
1822 # and four child regions (the trixels within the parent at the next
1823 # level. We'll use the parent as a tract/visit region and the children
1824 # as its patch/visit_detector regions.
1825 registry = self.makeRegistry()
1826 htm6 = registry.dimensions.skypix["htm"][6].pixelization
1827 commonSkyPix = registry.dimensions.commonSkyPix.pixelization
1828 index = 12288
1829 child_ranges_small = lsst.sphgeom.RangeSet(index).scaled(4)
1830 assert htm6.universe().contains(child_ranges_small)
1831 child_regions_small = [htm6.triangle(i) for i in unpack_range_set(child_ranges_small)]
1832 parent_region_small = lsst.sphgeom.ConvexPolygon(
1833 list(itertools.chain.from_iterable(c.getVertices() for c in child_regions_small))
1834 )
1835 assert all(parent_region_small.contains(c) for c in child_regions_small)
1836 # Make a larger version of each child region, defined to be the set of
1837 # htm6 trixels that overlap the original's bounding circle. Make a new
1838 # parent that's the convex hull of the new children.
1839 child_regions_large = [
1840 range_set_hull(htm6.envelope(c.getBoundingCircle()), htm6) for c in child_regions_small
1841 ]
1842 assert all(large.contains(small) for large, small in zip(child_regions_large, child_regions_small))
1843 parent_region_large = lsst.sphgeom.ConvexPolygon(
1844 list(itertools.chain.from_iterable(c.getVertices() for c in child_regions_large))
1845 )
1846 assert all(parent_region_large.contains(c) for c in child_regions_large)
1847 assert parent_region_large.contains(parent_region_small)
1848 assert not parent_region_small.contains(parent_region_large)
1849 assert not all(parent_region_small.contains(c) for c in child_regions_large)
1850 # Find some commonSkyPix indices that overlap the large regions but not
1851 # overlap the small regions. We use commonSkyPix here to make sure the
1852 # real tests later involve what's in the database, not just post-query
1853 # filtering of regions.
1854 child_difference_indices = []
1855 for large, small in zip(child_regions_large, child_regions_small):
1856 difference = list(unpack_range_set(commonSkyPix.envelope(large) - commonSkyPix.envelope(small)))
1857 assert difference, "if this is empty, we can't test anything useful with these regions"
1858 assert all(
1859 not commonSkyPix.triangle(d).isDisjointFrom(large)
1860 and commonSkyPix.triangle(d).isDisjointFrom(small)
1861 for d in difference
1862 )
1863 child_difference_indices.append(difference)
1864 parent_difference_indices = list(
1865 unpack_range_set(
1866 commonSkyPix.envelope(parent_region_large) - commonSkyPix.envelope(parent_region_small)
1867 )
1868 )
1869 assert parent_difference_indices, "if this is empty, we can't test anything useful with these regions"
1870 assert all(
1871 (
1872 not commonSkyPix.triangle(d).isDisjointFrom(parent_region_large)
1873 and commonSkyPix.triangle(d).isDisjointFrom(parent_region_small)
1874 )
1875 for d in parent_difference_indices
1876 )
1877 # Now that we've finally got those regions, we'll insert the large ones
1878 # as tract/patch dimension records.
1879 skymap_name = "testing_v1"
1880 registry.insertDimensionData(
1881 "skymap",
1882 {
1883 "name": skymap_name,
1884 "hash": bytes([42]),
1885 "tract_max": 1,
1886 "patch_nx_max": 2,
1887 "patch_ny_max": 2,
1888 },
1889 )
1890 registry.insertDimensionData("tract", {"skymap": skymap_name, "id": 0, "region": parent_region_large})
1891 registry.insertDimensionData(
1892 "patch",
1893 *[
1894 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1895 for n, c in enumerate(child_regions_large)
1896 ],
1897 )
1898 # Add at dataset that uses these dimensions to make sure that modifying
1899 # them doesn't disrupt foreign keys (need to make sure DB doesn't
1900 # implement insert with replace=True as delete-then-insert).
1901 dataset_type = DatasetType(
1902 "coadd",
1903 dimensions=["tract", "patch"],
1904 universe=registry.dimensions,
1905 storageClass="Exposure",
1906 )
1907 registry.registerDatasetType(dataset_type)
1908 registry.registerCollection("the_run", CollectionType.RUN)
1909 registry.insertDatasets(
1910 dataset_type,
1911 [{"skymap": skymap_name, "tract": 0, "patch": 2}],
1912 run="the_run",
1913 )
1914 # Query for tracts and patches that overlap some "difference" htm9
1915 # pixels; there should be overlaps, because the database has
1916 # the "large" suite of regions.
1917 self.assertEqual(
1918 {0},
1919 {
1920 data_id["tract"]
1921 for data_id in registry.queryDataIds(
1922 ["tract"],
1923 skymap=skymap_name,
1924 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
1925 )
1926 },
1927 )
1928 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
1929 self.assertIn(
1930 patch_id,
1931 {
1932 data_id["patch"]
1933 for data_id in registry.queryDataIds(
1934 ["patch"],
1935 skymap=skymap_name,
1936 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
1937 )
1938 },
1939 )
1940 # Use sync to update the tract region and insert to update the regions
1941 # of the patches, to the "small" suite.
1942 updated = registry.syncDimensionData(
1943 "tract",
1944 {"skymap": skymap_name, "id": 0, "region": parent_region_small},
1945 update=True,
1946 )
1947 self.assertEqual(updated, {"region": parent_region_large})
1948 registry.insertDimensionData(
1949 "patch",
1950 *[
1951 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1952 for n, c in enumerate(child_regions_small)
1953 ],
1954 replace=True,
1955 )
1956 # Query again; there now should be no such overlaps, because the
1957 # database has the "small" suite of regions.
1958 self.assertFalse(
1959 set(
1960 registry.queryDataIds(
1961 ["tract"],
1962 skymap=skymap_name,
1963 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
1964 )
1965 )
1966 )
1967 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
1968 self.assertNotIn(
1969 patch_id,
1970 {
1971 data_id["patch"]
1972 for data_id in registry.queryDataIds(
1973 ["patch"],
1974 skymap=skymap_name,
1975 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
1976 )
1977 },
1978 )
1979 # Update back to the large regions and query one more time.
1980 updated = registry.syncDimensionData(
1981 "tract",
1982 {"skymap": skymap_name, "id": 0, "region": parent_region_large},
1983 update=True,
1984 )
1985 self.assertEqual(updated, {"region": parent_region_small})
1986 registry.insertDimensionData(
1987 "patch",
1988 *[
1989 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1990 for n, c in enumerate(child_regions_large)
1991 ],
1992 replace=True,
1993 )
1994 self.assertEqual(
1995 {0},
1996 {
1997 data_id["tract"]
1998 for data_id in registry.queryDataIds(
1999 ["tract"],
2000 skymap=skymap_name,
2001 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
2002 )
2003 },
2004 )
2005 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
2006 self.assertIn(
2007 patch_id,
2008 {
2009 data_id["patch"]
2010 for data_id in registry.queryDataIds(
2011 ["patch"],
2012 skymap=skymap_name,
2013 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
2014 )
2015 },
2016 )
2018 def testCalibrationCollections(self):
2019 """Test operations on `~CollectionType.CALIBRATION` collections,
2020 including `Registry.certify`, `Registry.decertify`, and
2021 `Registry.findDataset`.
2022 """
2023 # Setup - make a Registry, fill it with some datasets in
2024 # non-calibration collections.
2025 registry = self.makeRegistry()
2026 self.loadData(registry, "base.yaml")
2027 self.loadData(registry, "datasets.yaml")
2028 # Set up some timestamps.
2029 t1 = astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai")
2030 t2 = astropy.time.Time("2020-01-01T02:00:00", format="isot", scale="tai")
2031 t3 = astropy.time.Time("2020-01-01T03:00:00", format="isot", scale="tai")
2032 t4 = astropy.time.Time("2020-01-01T04:00:00", format="isot", scale="tai")
2033 t5 = astropy.time.Time("2020-01-01T05:00:00", format="isot", scale="tai")
2034 allTimespans = [
2035 Timespan(a, b) for a, b in itertools.combinations([None, t1, t2, t3, t4, t5, None], r=2)
2036 ]
2037 # Get references to some datasets.
2038 bias2a = registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g")
2039 bias3a = registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g")
2040 bias2b = registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r")
2041 bias3b = registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r")
2042 # Register the main calibration collection we'll be working with.
2043 collection = "Cam1/calibs/default"
2044 registry.registerCollection(collection, type=CollectionType.CALIBRATION)
2045 # Cannot associate into a calibration collection (no timespan).
2046 with self.assertRaises(CollectionTypeError):
2047 registry.associate(collection, [bias2a])
2048 # Certify 2a dataset with [t2, t4) validity.
2049 registry.certify(collection, [bias2a], Timespan(begin=t2, end=t4))
2050 # Test that we can query for this dataset via the new collection, both
2051 # on its own and with a RUN collection, as long as we don't try to join
2052 # in temporal dimensions or use findFirst=True.
2053 self.assertEqual(
2054 set(registry.queryDatasets("bias", findFirst=False, collections=collection)),
2055 {bias2a},
2056 )
2057 self.assertEqual(
2058 set(registry.queryDatasets("bias", findFirst=False, collections=[collection, "imported_r"])),
2059 {
2060 bias2a,
2061 bias2b,
2062 bias3b,
2063 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
2064 },
2065 )
2066 self.assertEqual(
2067 set(registry.queryDataIds("detector", datasets="bias", collections=collection)),
2068 {registry.expandDataId(instrument="Cam1", detector=2)},
2069 )
2070 self.assertEqual(
2071 set(registry.queryDataIds("detector", datasets="bias", collections=[collection, "imported_r"])),
2072 {
2073 registry.expandDataId(instrument="Cam1", detector=2),
2074 registry.expandDataId(instrument="Cam1", detector=3),
2075 registry.expandDataId(instrument="Cam1", detector=4),
2076 },
2077 )
2079 # We should not be able to certify 2b with anything overlapping that
2080 # window.
2081 with self.assertRaises(ConflictingDefinitionError):
2082 registry.certify(collection, [bias2b], Timespan(begin=None, end=t3))
2083 with self.assertRaises(ConflictingDefinitionError):
2084 registry.certify(collection, [bias2b], Timespan(begin=None, end=t5))
2085 with self.assertRaises(ConflictingDefinitionError):
2086 registry.certify(collection, [bias2b], Timespan(begin=t1, end=t3))
2087 with self.assertRaises(ConflictingDefinitionError):
2088 registry.certify(collection, [bias2b], Timespan(begin=t1, end=t5))
2089 with self.assertRaises(ConflictingDefinitionError):
2090 registry.certify(collection, [bias2b], Timespan(begin=t1, end=None))
2091 with self.assertRaises(ConflictingDefinitionError):
2092 registry.certify(collection, [bias2b], Timespan(begin=t2, end=t3))
2093 with self.assertRaises(ConflictingDefinitionError):
2094 registry.certify(collection, [bias2b], Timespan(begin=t2, end=t5))
2095 with self.assertRaises(ConflictingDefinitionError):
2096 registry.certify(collection, [bias2b], Timespan(begin=t2, end=None))
2097 # We should be able to certify 3a with a range overlapping that window,
2098 # because it's for a different detector.
2099 # We'll certify 3a over [t1, t3).
2100 registry.certify(collection, [bias3a], Timespan(begin=t1, end=t3))
2101 # Now we'll certify 2b and 3b together over [t4, ∞).
2102 registry.certify(collection, [bias2b, bias3b], Timespan(begin=t4, end=None))
2104 # Fetch all associations and check that they are what we expect.
2105 self.assertCountEqual(
2106 list(
2107 registry.queryDatasetAssociations(
2108 "bias",
2109 collections=[collection, "imported_g", "imported_r"],
2110 )
2111 ),
2112 [
2113 DatasetAssociation(
2114 ref=registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
2115 collection="imported_g",
2116 timespan=None,
2117 ),
2118 DatasetAssociation(
2119 ref=registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
2120 collection="imported_r",
2121 timespan=None,
2122 ),
2123 DatasetAssociation(ref=bias2a, collection="imported_g", timespan=None),
2124 DatasetAssociation(ref=bias3a, collection="imported_g", timespan=None),
2125 DatasetAssociation(ref=bias2b, collection="imported_r", timespan=None),
2126 DatasetAssociation(ref=bias3b, collection="imported_r", timespan=None),
2127 DatasetAssociation(ref=bias2a, collection=collection, timespan=Timespan(begin=t2, end=t4)),
2128 DatasetAssociation(ref=bias3a, collection=collection, timespan=Timespan(begin=t1, end=t3)),
2129 DatasetAssociation(ref=bias2b, collection=collection, timespan=Timespan(begin=t4, end=None)),
2130 DatasetAssociation(ref=bias3b, collection=collection, timespan=Timespan(begin=t4, end=None)),
2131 ],
2132 )
2134 class Ambiguous:
2135 """Tag class to denote lookups that should be ambiguous."""
2137 pass
2139 def assertLookup(
2140 detector: int, timespan: Timespan, expected: Optional[Union[DatasetRef, Type[Ambiguous]]]
2141 ) -> None:
2142 """Local function that asserts that a bias lookup returns the given
2143 expected result.
2144 """
2145 if expected is Ambiguous:
2146 with self.assertRaises((DatasetTypeError, LookupError)):
2147 registry.findDataset(
2148 "bias",
2149 collections=collection,
2150 instrument="Cam1",
2151 detector=detector,
2152 timespan=timespan,
2153 )
2154 else:
2155 self.assertEqual(
2156 expected,
2157 registry.findDataset(
2158 "bias",
2159 collections=collection,
2160 instrument="Cam1",
2161 detector=detector,
2162 timespan=timespan,
2163 ),
2164 )
2166 # Systematically test lookups against expected results.
2167 assertLookup(detector=2, timespan=Timespan(None, t1), expected=None)
2168 assertLookup(detector=2, timespan=Timespan(None, t2), expected=None)
2169 assertLookup(detector=2, timespan=Timespan(None, t3), expected=bias2a)
2170 assertLookup(detector=2, timespan=Timespan(None, t4), expected=bias2a)
2171 assertLookup(detector=2, timespan=Timespan(None, t5), expected=Ambiguous)
2172 assertLookup(detector=2, timespan=Timespan(None, None), expected=Ambiguous)
2173 assertLookup(detector=2, timespan=Timespan(t1, t2), expected=None)
2174 assertLookup(detector=2, timespan=Timespan(t1, t3), expected=bias2a)
2175 assertLookup(detector=2, timespan=Timespan(t1, t4), expected=bias2a)
2176 assertLookup(detector=2, timespan=Timespan(t1, t5), expected=Ambiguous)
2177 assertLookup(detector=2, timespan=Timespan(t1, None), expected=Ambiguous)
2178 assertLookup(detector=2, timespan=Timespan(t2, t3), expected=bias2a)
2179 assertLookup(detector=2, timespan=Timespan(t2, t4), expected=bias2a)
2180 assertLookup(detector=2, timespan=Timespan(t2, t5), expected=Ambiguous)
2181 assertLookup(detector=2, timespan=Timespan(t2, None), expected=Ambiguous)
2182 assertLookup(detector=2, timespan=Timespan(t3, t4), expected=bias2a)
2183 assertLookup(detector=2, timespan=Timespan(t3, t5), expected=Ambiguous)
2184 assertLookup(detector=2, timespan=Timespan(t3, None), expected=Ambiguous)
2185 assertLookup(detector=2, timespan=Timespan(t4, t5), expected=bias2b)
2186 assertLookup(detector=2, timespan=Timespan(t4, None), expected=bias2b)
2187 assertLookup(detector=2, timespan=Timespan(t5, None), expected=bias2b)
2188 assertLookup(detector=3, timespan=Timespan(None, t1), expected=None)
2189 assertLookup(detector=3, timespan=Timespan(None, t2), expected=bias3a)
2190 assertLookup(detector=3, timespan=Timespan(None, t3), expected=bias3a)
2191 assertLookup(detector=3, timespan=Timespan(None, t4), expected=bias3a)
2192 assertLookup(detector=3, timespan=Timespan(None, t5), expected=Ambiguous)
2193 assertLookup(detector=3, timespan=Timespan(None, None), expected=Ambiguous)
2194 assertLookup(detector=3, timespan=Timespan(t1, t2), expected=bias3a)
2195 assertLookup(detector=3, timespan=Timespan(t1, t3), expected=bias3a)
2196 assertLookup(detector=3, timespan=Timespan(t1, t4), expected=bias3a)
2197 assertLookup(detector=3, timespan=Timespan(t1, t5), expected=Ambiguous)
2198 assertLookup(detector=3, timespan=Timespan(t1, None), expected=Ambiguous)
2199 assertLookup(detector=3, timespan=Timespan(t2, t3), expected=bias3a)
2200 assertLookup(detector=3, timespan=Timespan(t2, t4), expected=bias3a)
2201 assertLookup(detector=3, timespan=Timespan(t2, t5), expected=Ambiguous)
2202 assertLookup(detector=3, timespan=Timespan(t2, None), expected=Ambiguous)
2203 assertLookup(detector=3, timespan=Timespan(t3, t4), expected=None)
2204 assertLookup(detector=3, timespan=Timespan(t3, t5), expected=bias3b)
2205 assertLookup(detector=3, timespan=Timespan(t3, None), expected=bias3b)
2206 assertLookup(detector=3, timespan=Timespan(t4, t5), expected=bias3b)
2207 assertLookup(detector=3, timespan=Timespan(t4, None), expected=bias3b)
2208 assertLookup(detector=3, timespan=Timespan(t5, None), expected=bias3b)
2210 # Decertify [t3, t5) for all data IDs, and do test lookups again.
2211 # This should truncate bias2a to [t2, t3), leave bias3a unchanged at
2212 # [t1, t3), and truncate bias2b and bias3b to [t5, ∞).
2213 registry.decertify(collection=collection, datasetType="bias", timespan=Timespan(t3, t5))
2214 assertLookup(detector=2, timespan=Timespan(None, t1), expected=None)
2215 assertLookup(detector=2, timespan=Timespan(None, t2), expected=None)
2216 assertLookup(detector=2, timespan=Timespan(None, t3), expected=bias2a)
2217 assertLookup(detector=2, timespan=Timespan(None, t4), expected=bias2a)
2218 assertLookup(detector=2, timespan=Timespan(None, t5), expected=bias2a)
2219 assertLookup(detector=2, timespan=Timespan(None, None), expected=Ambiguous)
2220 assertLookup(detector=2, timespan=Timespan(t1, t2), expected=None)
2221 assertLookup(detector=2, timespan=Timespan(t1, t3), expected=bias2a)
2222 assertLookup(detector=2, timespan=Timespan(t1, t4), expected=bias2a)
2223 assertLookup(detector=2, timespan=Timespan(t1, t5), expected=bias2a)
2224 assertLookup(detector=2, timespan=Timespan(t1, None), expected=Ambiguous)
2225 assertLookup(detector=2, timespan=Timespan(t2, t3), expected=bias2a)
2226 assertLookup(detector=2, timespan=Timespan(t2, t4), expected=bias2a)
2227 assertLookup(detector=2, timespan=Timespan(t2, t5), expected=bias2a)
2228 assertLookup(detector=2, timespan=Timespan(t2, None), expected=Ambiguous)
2229 assertLookup(detector=2, timespan=Timespan(t3, t4), expected=None)
2230 assertLookup(detector=2, timespan=Timespan(t3, t5), expected=None)
2231 assertLookup(detector=2, timespan=Timespan(t3, None), expected=bias2b)
2232 assertLookup(detector=2, timespan=Timespan(t4, t5), expected=None)
2233 assertLookup(detector=2, timespan=Timespan(t4, None), expected=bias2b)
2234 assertLookup(detector=2, timespan=Timespan(t5, None), expected=bias2b)
2235 assertLookup(detector=3, timespan=Timespan(None, t1), expected=None)
2236 assertLookup(detector=3, timespan=Timespan(None, t2), expected=bias3a)
2237 assertLookup(detector=3, timespan=Timespan(None, t3), expected=bias3a)
2238 assertLookup(detector=3, timespan=Timespan(None, t4), expected=bias3a)
2239 assertLookup(detector=3, timespan=Timespan(None, t5), expected=bias3a)
2240 assertLookup(detector=3, timespan=Timespan(None, None), expected=Ambiguous)
2241 assertLookup(detector=3, timespan=Timespan(t1, t2), expected=bias3a)
2242 assertLookup(detector=3, timespan=Timespan(t1, t3), expected=bias3a)
2243 assertLookup(detector=3, timespan=Timespan(t1, t4), expected=bias3a)
2244 assertLookup(detector=3, timespan=Timespan(t1, t5), expected=bias3a)
2245 assertLookup(detector=3, timespan=Timespan(t1, None), expected=Ambiguous)
2246 assertLookup(detector=3, timespan=Timespan(t2, t3), expected=bias3a)
2247 assertLookup(detector=3, timespan=Timespan(t2, t4), expected=bias3a)
2248 assertLookup(detector=3, timespan=Timespan(t2, t5), expected=bias3a)
2249 assertLookup(detector=3, timespan=Timespan(t2, None), expected=Ambiguous)
2250 assertLookup(detector=3, timespan=Timespan(t3, t4), expected=None)
2251 assertLookup(detector=3, timespan=Timespan(t3, t5), expected=None)
2252 assertLookup(detector=3, timespan=Timespan(t3, None), expected=bias3b)
2253 assertLookup(detector=3, timespan=Timespan(t4, t5), expected=None)
2254 assertLookup(detector=3, timespan=Timespan(t4, None), expected=bias3b)
2255 assertLookup(detector=3, timespan=Timespan(t5, None), expected=bias3b)
2257 # Decertify everything, this time with explicit data IDs, then check
2258 # that no lookups succeed.
2259 registry.decertify(
2260 collection,
2261 "bias",
2262 Timespan(None, None),
2263 dataIds=[
2264 dict(instrument="Cam1", detector=2),
2265 dict(instrument="Cam1", detector=3),
2266 ],
2267 )
2268 for detector in (2, 3):
2269 for timespan in allTimespans:
2270 assertLookup(detector=detector, timespan=timespan, expected=None)
2271 # Certify bias2a and bias3a over (-∞, ∞), check that all lookups return
2272 # those.
2273 registry.certify(
2274 collection,
2275 [bias2a, bias3a],
2276 Timespan(None, None),
2277 )
2278 for timespan in allTimespans:
2279 assertLookup(detector=2, timespan=timespan, expected=bias2a)
2280 assertLookup(detector=3, timespan=timespan, expected=bias3a)
2281 # Decertify just bias2 over [t2, t4).
2282 # This should split a single certification row into two (and leave the
2283 # other existing row, for bias3a, alone).
2284 registry.decertify(
2285 collection, "bias", Timespan(t2, t4), dataIds=[dict(instrument="Cam1", detector=2)]
2286 )
2287 for timespan in allTimespans:
2288 assertLookup(detector=3, timespan=timespan, expected=bias3a)
2289 overlapsBefore = timespan.overlaps(Timespan(None, t2))
2290 overlapsAfter = timespan.overlaps(Timespan(t4, None))
2291 if overlapsBefore and overlapsAfter:
2292 expected = Ambiguous
2293 elif overlapsBefore or overlapsAfter:
2294 expected = bias2a
2295 else:
2296 expected = None
2297 assertLookup(detector=2, timespan=timespan, expected=expected)
2299 def testSkipCalibs(self):
2300 """Test how queries handle skipping of calibration collections."""
2301 registry = self.makeRegistry()
2302 self.loadData(registry, "base.yaml")
2303 self.loadData(registry, "datasets.yaml")
2305 coll_calib = "Cam1/calibs/default"
2306 registry.registerCollection(coll_calib, type=CollectionType.CALIBRATION)
2308 # Add all biases to the calibration collection.
2309 # Without this, the logic that prunes dataset subqueries based on
2310 # datasetType-collection summary information will fire before the logic
2311 # we want to test below. This is a good thing (it avoids the dreaded
2312 # NotImplementedError a bit more often) everywhere but here.
2313 registry.certify(coll_calib, registry.queryDatasets("bias", collections=...), Timespan(None, None))
2315 coll_list = [coll_calib, "imported_g", "imported_r"]
2316 chain = "Cam1/chain"
2317 registry.registerCollection(chain, type=CollectionType.CHAINED)
2318 registry.setCollectionChain(chain, coll_list)
2320 # explicit list will raise if findFirst=True or there are temporal
2321 # dimensions
2322 with self.assertRaises(NotImplementedError):
2323 registry.queryDatasets("bias", collections=coll_list, findFirst=True)
2324 with self.assertRaises(NotImplementedError):
2325 registry.queryDataIds(
2326 ["instrument", "detector", "exposure"], datasets="bias", collections=coll_list
2327 ).count()
2329 # chain will skip
2330 datasets = list(registry.queryDatasets("bias", collections=chain))
2331 self.assertGreater(len(datasets), 0)
2333 dataIds = list(registry.queryDataIds(["instrument", "detector"], datasets="bias", collections=chain))
2334 self.assertGreater(len(dataIds), 0)
2336 # glob will skip too
2337 datasets = list(registry.queryDatasets("bias", collections="*d*"))
2338 self.assertGreater(len(datasets), 0)
2340 # regular expression will skip too
2341 pattern = re.compile(".*")
2342 datasets = list(registry.queryDatasets("bias", collections=pattern))
2343 self.assertGreater(len(datasets), 0)
2345 # ellipsis should work as usual
2346 datasets = list(registry.queryDatasets("bias", collections=...))
2347 self.assertGreater(len(datasets), 0)
2349 # few tests with findFirst
2350 datasets = list(registry.queryDatasets("bias", collections=chain, findFirst=True))
2351 self.assertGreater(len(datasets), 0)
2353 def testIngestTimeQuery(self):
2354 registry = self.makeRegistry()
2355 self.loadData(registry, "base.yaml")
2356 dt0 = datetime.utcnow()
2357 self.loadData(registry, "datasets.yaml")
2358 dt1 = datetime.utcnow()
2360 datasets = list(registry.queryDatasets(..., collections=...))
2361 len0 = len(datasets)
2362 self.assertGreater(len0, 0)
2364 where = "ingest_date > T'2000-01-01'"
2365 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2366 len1 = len(datasets)
2367 self.assertEqual(len0, len1)
2369 # no one will ever use this piece of software in 30 years
2370 where = "ingest_date > T'2050-01-01'"
2371 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2372 len2 = len(datasets)
2373 self.assertEqual(len2, 0)
2375 # Check more exact timing to make sure there is no 37 seconds offset
2376 # (after fixing DM-30124). SQLite time precision is 1 second, make
2377 # sure that we don't test with higher precision.
2378 tests = [
2379 # format: (timestamp, operator, expected_len)
2380 (dt0 - timedelta(seconds=1), ">", len0),
2381 (dt0 - timedelta(seconds=1), "<", 0),
2382 (dt1 + timedelta(seconds=1), "<", len0),
2383 (dt1 + timedelta(seconds=1), ">", 0),
2384 ]
2385 for dt, op, expect_len in tests:
2386 dt_str = dt.isoformat(sep=" ")
2388 where = f"ingest_date {op} T'{dt_str}'"
2389 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2390 self.assertEqual(len(datasets), expect_len)
2392 # same with bind using datetime or astropy Time
2393 where = f"ingest_date {op} ingest_time"
2394 datasets = list(
2395 registry.queryDatasets(..., collections=..., where=where, bind={"ingest_time": dt})
2396 )
2397 self.assertEqual(len(datasets), expect_len)
2399 dt_astropy = astropy.time.Time(dt, format="datetime")
2400 datasets = list(
2401 registry.queryDatasets(..., collections=..., where=where, bind={"ingest_time": dt_astropy})
2402 )
2403 self.assertEqual(len(datasets), expect_len)
2405 def testTimespanQueries(self):
2406 """Test query expressions involving timespans."""
2407 registry = self.makeRegistry()
2408 self.loadData(registry, "hsc-rc2-subset.yaml")
2409 # All exposures in the database; mapping from ID to timespan.
2410 visits = {record.id: record.timespan for record in registry.queryDimensionRecords("visit")}
2411 # Just those IDs, sorted (which is also temporal sorting, because HSC
2412 # exposure IDs are monotonically increasing).
2413 ids = sorted(visits.keys())
2414 self.assertGreater(len(ids), 20)
2415 # Pick some quasi-random indexes into `ids` to play with.
2416 i1 = int(len(ids) * 0.1)
2417 i2 = int(len(ids) * 0.3)
2418 i3 = int(len(ids) * 0.6)
2419 i4 = int(len(ids) * 0.8)
2420 # Extract some times from those: just before the beginning of i1 (which
2421 # should be after the end of the exposure before), exactly the
2422 # beginning of i2, just after the beginning of i3 (and before its end),
2423 # and the exact end of i4.
2424 t1 = visits[ids[i1]].begin - astropy.time.TimeDelta(1.0, format="sec")
2425 self.assertGreater(t1, visits[ids[i1 - 1]].end)
2426 t2 = visits[ids[i2]].begin
2427 t3 = visits[ids[i3]].begin + astropy.time.TimeDelta(1.0, format="sec")
2428 self.assertLess(t3, visits[ids[i3]].end)
2429 t4 = visits[ids[i4]].end
2430 # Make sure those are actually in order.
2431 self.assertEqual([t1, t2, t3, t4], sorted([t4, t3, t2, t1]))
2433 bind = {
2434 "t1": t1,
2435 "t2": t2,
2436 "t3": t3,
2437 "t4": t4,
2438 "ts23": Timespan(t2, t3),
2439 }
2441 def query(where):
2442 """Helper function that queries for visit data IDs and returns
2443 results as a sorted, deduplicated list of visit IDs.
2444 """
2445 return sorted(
2446 {
2447 dataId["visit"]
2448 for dataId in registry.queryDataIds("visit", instrument="HSC", bind=bind, where=where)
2449 }
2450 )
2452 # Try a bunch of timespan queries, mixing up the bounds themselves,
2453 # where they appear in the expression, and how we get the timespan into
2454 # the expression.
2456 # t1 is before the start of i1, so this should not include i1.
2457 self.assertEqual(ids[:i1], query("visit.timespan OVERLAPS (null, t1)"))
2458 # t2 is exactly at the start of i2, but ends are exclusive, so these
2459 # should not include i2.
2460 self.assertEqual(ids[i1:i2], query("(t1, t2) OVERLAPS visit.timespan"))
2461 self.assertEqual(ids[:i2], query("visit.timespan < (t2, t4)"))
2462 # t3 is in the middle of i3, so this should include i3.
2463 self.assertEqual(ids[i2 : i3 + 1], query("visit.timespan OVERLAPS ts23"))
2464 # This one should not include t3 by the same reasoning.
2465 self.assertEqual(ids[i3 + 1 :], query("visit.timespan > (t1, t3)"))
2466 # t4 is exactly at the end of i4, so this should include i4.
2467 self.assertEqual(ids[i3 : i4 + 1], query(f"visit.timespan OVERLAPS (T'{t3.tai.isot}', t4)"))
2468 # i4's upper bound of t4 is exclusive so this should not include t4.
2469 self.assertEqual(ids[i4 + 1 :], query("visit.timespan OVERLAPS (t4, NULL)"))
2471 # Now some timespan vs. time scalar queries.
2472 self.assertEqual(ids[:i2], query("visit.timespan < t2"))
2473 self.assertEqual(ids[:i2], query("t2 > visit.timespan"))
2474 self.assertEqual(ids[i3 + 1 :], query("visit.timespan > t3"))
2475 self.assertEqual(ids[i3 + 1 :], query("t3 < visit.timespan"))
2476 self.assertEqual(ids[i3 : i3 + 1], query("visit.timespan OVERLAPS t3"))
2477 self.assertEqual(ids[i3 : i3 + 1], query(f"T'{t3.tai.isot}' OVERLAPS visit.timespan"))
2479 # Empty timespans should not overlap anything.
2480 self.assertEqual([], query("visit.timespan OVERLAPS (t3, t2)"))
2482 def testCollectionSummaries(self):
2483 """Test recording and retrieval of collection summaries."""
2484 self.maxDiff = None
2485 registry = self.makeRegistry()
2486 # Importing datasets from yaml should go through the code path where
2487 # we update collection summaries as we insert datasets.
2488 self.loadData(registry, "base.yaml")
2489 self.loadData(registry, "datasets.yaml")
2490 flat = registry.getDatasetType("flat")
2491 expected1 = CollectionSummary()
2492 expected1.dataset_types.add(registry.getDatasetType("bias"))
2493 expected1.add_data_ids(
2494 flat, [DataCoordinate.standardize(instrument="Cam1", universe=registry.dimensions)]
2495 )
2496 self.assertEqual(registry.getCollectionSummary("imported_g"), expected1)
2497 self.assertEqual(registry.getCollectionSummary("imported_r"), expected1)
2498 # Create a chained collection with both of the imported runs; the
2499 # summary should be the same, because it's a union with itself.
2500 chain = "chain"
2501 registry.registerCollection(chain, CollectionType.CHAINED)
2502 registry.setCollectionChain(chain, ["imported_r", "imported_g"])
2503 self.assertEqual(registry.getCollectionSummary(chain), expected1)
2504 # Associate flats only into a tagged collection and a calibration
2505 # collection to check summaries of those.
2506 tag = "tag"
2507 registry.registerCollection(tag, CollectionType.TAGGED)
2508 registry.associate(tag, registry.queryDatasets(flat, collections="imported_g"))
2509 calibs = "calibs"
2510 registry.registerCollection(calibs, CollectionType.CALIBRATION)
2511 registry.certify(
2512 calibs, registry.queryDatasets(flat, collections="imported_g"), timespan=Timespan(None, None)
2513 )
2514 expected2 = expected1.copy()
2515 expected2.dataset_types.discard("bias")
2516 self.assertEqual(registry.getCollectionSummary(tag), expected2)
2517 self.assertEqual(registry.getCollectionSummary(calibs), expected2)
2518 # Explicitly calling Registry.refresh() should load those same
2519 # summaries, via a totally different code path.
2520 registry.refresh()
2521 self.assertEqual(registry.getCollectionSummary("imported_g"), expected1)
2522 self.assertEqual(registry.getCollectionSummary("imported_r"), expected1)
2523 self.assertEqual(registry.getCollectionSummary(tag), expected2)
2524 self.assertEqual(registry.getCollectionSummary(calibs), expected2)
2526 def testBindInQueryDatasets(self):
2527 """Test that the bind parameter is correctly forwarded in
2528 queryDatasets recursion.
2529 """
2530 registry = self.makeRegistry()
2531 # Importing datasets from yaml should go through the code path where
2532 # we update collection summaries as we insert datasets.
2533 self.loadData(registry, "base.yaml")
2534 self.loadData(registry, "datasets.yaml")
2535 self.assertEqual(
2536 set(registry.queryDatasets("flat", band="r", collections=...)),
2537 set(registry.queryDatasets("flat", where="band=my_band", bind={"my_band": "r"}, collections=...)),
2538 )
2540 def testQueryIntRangeExpressions(self):
2541 """Test integer range expressions in ``where`` arguments.
2543 Note that our expressions use inclusive stop values, unlike Python's.
2544 """
2545 registry = self.makeRegistry()
2546 self.loadData(registry, "base.yaml")
2547 self.assertEqual(
2548 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (1..2)")),
2549 {registry.expandDataId(instrument="Cam1", detector=n) for n in [1, 2]},
2550 )
2551 self.assertEqual(
2552 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (1..4:2)")),
2553 {registry.expandDataId(instrument="Cam1", detector=n) for n in [1, 3]},
2554 )
2555 self.assertEqual(
2556 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (2..4:2)")),
2557 {registry.expandDataId(instrument="Cam1", detector=n) for n in [2, 4]},
2558 )
2560 def testQueryResultSummaries(self):
2561 """Test summary methods like `count`, `any`, and `explain_no_results`
2562 on `DataCoordinateQueryResults` and `DatasetQueryResults`
2563 """
2564 registry = self.makeRegistry()
2565 self.loadData(registry, "base.yaml")
2566 self.loadData(registry, "datasets.yaml")
2567 self.loadData(registry, "spatial.yaml")
2568 # Default test dataset has two collections, each with both flats and
2569 # biases. Add a new collection with only biases.
2570 registry.registerCollection("biases", CollectionType.TAGGED)
2571 registry.associate("biases", registry.queryDatasets("bias", collections=["imported_g"]))
2572 # First query yields two results, and involves no postprocessing.
2573 query1 = registry.queryDataIds(["physical_filter"], band="r")
2574 self.assertTrue(query1.any(execute=False, exact=False))
2575 self.assertTrue(query1.any(execute=True, exact=False))
2576 self.assertTrue(query1.any(execute=True, exact=True))
2577 self.assertEqual(query1.count(exact=False), 2)
2578 self.assertEqual(query1.count(exact=True), 2)
2579 self.assertFalse(list(query1.explain_no_results()))
2580 # Second query should yield no results, which we should see when
2581 # we attempt to expand the data ID.
2582 query2 = registry.queryDataIds(["physical_filter"], band="h")
2583 # There's no execute=False, exact=Fals test here because the behavior
2584 # not something we want to guarantee in this case (and exact=False
2585 # says either answer is legal).
2586 self.assertFalse(query2.any(execute=True, exact=False))
2587 self.assertFalse(query2.any(execute=True, exact=True))
2588 self.assertEqual(query2.count(exact=False), 0)
2589 self.assertEqual(query2.count(exact=True), 0)
2590 self.assertTrue(list(query2.explain_no_results()))
2591 # These queries yield no results due to various problems that can be
2592 # spotted prior to execution, yielding helpful diagnostics.
2593 base_query = registry.queryDataIds(["detector", "physical_filter"])
2594 queries_and_snippets = [
2595 (
2596 # Dataset type name doesn't match any existing dataset types.
2597 registry.queryDatasets("nonexistent", collections=...),
2598 ["nonexistent"],
2599 ),
2600 (
2601 # Dataset type object isn't registered.
2602 registry.queryDatasets(
2603 DatasetType(
2604 "nonexistent",
2605 dimensions=["instrument"],
2606 universe=registry.dimensions,
2607 storageClass="Image",
2608 ),
2609 collections=...,
2610 ),
2611 ["nonexistent"],
2612 ),
2613 (
2614 # No datasets of this type in this collection.
2615 registry.queryDatasets("flat", collections=["biases"]),
2616 ["flat", "biases"],
2617 ),
2618 (
2619 # No datasets of this type in this collection.
2620 base_query.findDatasets("flat", collections=["biases"]),
2621 ["flat", "biases"],
2622 ),
2623 (
2624 # No collections matching at all.
2625 registry.queryDatasets("flat", collections=re.compile("potato.+")),
2626 ["potato"],
2627 ),
2628 ]
2629 # The behavior of these additional queries is slated to change in the
2630 # future, so we also check for deprecation warnings.
2631 with self.assertWarns(FutureWarning):
2632 queries_and_snippets.append(
2633 (
2634 # Dataset type name doesn't match any existing dataset
2635 # types.
2636 registry.queryDataIds(["detector"], datasets=["nonexistent"], collections=...),
2637 ["nonexistent"],
2638 )
2639 )
2640 with self.assertWarns(FutureWarning):
2641 queries_and_snippets.append(
2642 (
2643 # Dataset type name doesn't match any existing dataset
2644 # types.
2645 registry.queryDimensionRecords("detector", datasets=["nonexistent"], collections=...),
2646 ["nonexistent"],
2647 )
2648 )
2649 for query, snippets in queries_and_snippets:
2650 self.assertFalse(query.any(execute=False, exact=False))
2651 self.assertFalse(query.any(execute=True, exact=False))
2652 self.assertFalse(query.any(execute=True, exact=True))
2653 self.assertEqual(query.count(exact=False), 0)
2654 self.assertEqual(query.count(exact=True), 0)
2655 messages = list(query.explain_no_results())
2656 self.assertTrue(messages)
2657 # Want all expected snippets to appear in at least one message.
2658 self.assertTrue(
2659 any(
2660 all(snippet in message for snippet in snippets) for message in query.explain_no_results()
2661 ),
2662 messages,
2663 )
2665 # This query does yield results, but should also emit a warning because
2666 # dataset type patterns to queryDataIds is deprecated; just look for
2667 # the warning.
2668 with self.assertWarns(FutureWarning):
2669 registry.queryDataIds(["detector"], datasets=re.compile("^nonexistent$"), collections=...)
2671 # These queries yield no results due to problems that can be identified
2672 # by cheap follow-up queries, yielding helpful diagnostics.
2673 for query, snippets in [
2674 (
2675 # No records for one of the involved dimensions.
2676 registry.queryDataIds(["subfilter"]),
2677 ["no rows", "subfilter"],
2678 ),
2679 (
2680 # No records for one of the involved dimensions.
2681 registry.queryDimensionRecords("subfilter"),
2682 ["no rows", "subfilter"],
2683 ),
2684 ]:
2685 self.assertFalse(query.any(execute=True, exact=False))
2686 self.assertFalse(query.any(execute=True, exact=True))
2687 self.assertEqual(query.count(exact=True), 0)
2688 messages = list(query.explain_no_results())
2689 self.assertTrue(messages)
2690 # Want all expected snippets to appear in at least one message.
2691 self.assertTrue(
2692 any(
2693 all(snippet in message for snippet in snippets) for message in query.explain_no_results()
2694 ),
2695 messages,
2696 )
2698 # This query yields four overlaps in the database, but one is filtered
2699 # out in postprocessing. The count queries aren't accurate because
2700 # they don't account for duplication that happens due to an internal
2701 # join against commonSkyPix.
2702 query3 = registry.queryDataIds(["visit", "tract"], instrument="Cam1", skymap="SkyMap1")
2703 self.assertEqual(
2704 {
2705 DataCoordinate.standardize(
2706 instrument="Cam1",
2707 skymap="SkyMap1",
2708 visit=v,
2709 tract=t,
2710 universe=registry.dimensions,
2711 )
2712 for v, t in [(1, 0), (2, 0), (2, 1)]
2713 },
2714 set(query3),
2715 )
2716 self.assertTrue(query3.any(execute=False, exact=False))
2717 self.assertTrue(query3.any(execute=True, exact=False))
2718 self.assertTrue(query3.any(execute=True, exact=True))
2719 self.assertGreaterEqual(query3.count(exact=False), 4)
2720 self.assertGreaterEqual(query3.count(exact=True, discard=True), 3)
2721 self.assertFalse(list(query3.explain_no_results()))
2722 # This query yields overlaps in the database, but all are filtered
2723 # out in postprocessing. The count queries again aren't very useful.
2724 # We have to use `where=` here to avoid an optimization that
2725 # (currently) skips the spatial postprocess-filtering because it
2726 # recognizes that no spatial join is necessary. That's not ideal, but
2727 # fixing it is out of scope for this ticket.
2728 query4 = registry.queryDataIds(
2729 ["visit", "tract"],
2730 instrument="Cam1",
2731 skymap="SkyMap1",
2732 where="visit=1 AND detector=1 AND tract=0 AND patch=4",
2733 )
2734 self.assertFalse(set(query4))
2735 self.assertTrue(query4.any(execute=False, exact=False))
2736 self.assertTrue(query4.any(execute=True, exact=False))
2737 self.assertFalse(query4.any(execute=True, exact=True))
2738 self.assertGreaterEqual(query4.count(exact=False), 1)
2739 self.assertEqual(query4.count(exact=True, discard=True), 0)
2740 messages = query4.explain_no_results()
2741 self.assertTrue(messages)
2742 self.assertTrue(any("overlap" in message for message in messages))
2743 # This query should yield results from one dataset type but not the
2744 # other, which is not registered.
2745 query5 = registry.queryDatasets(["bias", "nonexistent"], collections=["biases"])
2746 self.assertTrue(set(query5))
2747 self.assertTrue(query5.any(execute=False, exact=False))
2748 self.assertTrue(query5.any(execute=True, exact=False))
2749 self.assertTrue(query5.any(execute=True, exact=True))
2750 self.assertGreaterEqual(query5.count(exact=False), 1)
2751 self.assertGreaterEqual(query5.count(exact=True), 1)
2752 self.assertFalse(list(query5.explain_no_results()))
2753 # This query applies a selection that yields no results, fully in the
2754 # database. Explaining why it fails involves traversing the relation
2755 # tree and running a LIMIT 1 query at each level that has the potential
2756 # to remove rows.
2757 query6 = registry.queryDimensionRecords(
2758 "detector", where="detector.purpose = 'no-purpose'", instrument="Cam1"
2759 )
2760 self.assertEqual(query6.count(exact=True), 0)
2761 messages = query6.explain_no_results()
2762 self.assertTrue(messages)
2763 self.assertTrue(any("no-purpose" in message for message in messages))
2765 def testQueryDataIdsOrderBy(self):
2766 """Test order_by and limit on result returned by queryDataIds()."""
2767 registry = self.makeRegistry()
2768 self.loadData(registry, "base.yaml")
2769 self.loadData(registry, "datasets.yaml")
2770 self.loadData(registry, "spatial.yaml")
2772 def do_query(dimensions=("visit", "tract"), datasets=None, collections=None):
2773 return registry.queryDataIds(
2774 dimensions, datasets=datasets, collections=collections, instrument="Cam1", skymap="SkyMap1"
2775 )
2777 Test = namedtuple(
2778 "testQueryDataIdsOrderByTest",
2779 ("order_by", "keys", "result", "limit", "datasets", "collections"),
2780 defaults=(None, None, None),
2781 )
2783 test_data = (
2784 Test("tract,visit", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2785 Test("-tract,visit", "tract,visit", ((1, 2), (1, 2), (0, 1), (0, 1), (0, 2), (0, 2))),
2786 Test("tract,-visit", "tract,visit", ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2))),
2787 Test("-tract,-visit", "tract,visit", ((1, 2), (1, 2), (0, 2), (0, 2), (0, 1), (0, 1))),
2788 Test(
2789 "tract.id,visit.id",
2790 "tract,visit",
2791 ((0, 1), (0, 1), (0, 2)),
2792 limit=(3,),
2793 ),
2794 Test("-tract,-visit", "tract,visit", ((1, 2), (1, 2), (0, 2)), limit=(3,)),
2795 Test("tract,visit", "tract,visit", ((0, 2), (1, 2), (1, 2)), limit=(3, 3)),
2796 Test("-tract,-visit", "tract,visit", ((0, 1),), limit=(3, 5)),
2797 Test(
2798 "tract,visit.exposure_time", "tract,visit", ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2))
2799 ),
2800 Test(
2801 "-tract,-visit.exposure_time", "tract,visit", ((1, 2), (1, 2), (0, 1), (0, 1), (0, 2), (0, 2))
2802 ),
2803 Test("tract,-exposure_time", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2804 Test("tract,visit.name", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2805 Test(
2806 "tract,-timespan.begin,timespan.end",
2807 "tract,visit",
2808 ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2)),
2809 ),
2810 Test("visit.day_obs,exposure.day_obs", "visit,exposure", ()),
2811 Test("visit.timespan.begin,-exposure.timespan.begin", "visit,exposure", ()),
2812 Test(
2813 "tract,detector",
2814 "tract,detector",
2815 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2816 datasets="flat",
2817 collections="imported_r",
2818 ),
2819 Test(
2820 "tract,detector.full_name",
2821 "tract,detector",
2822 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2823 datasets="flat",
2824 collections="imported_r",
2825 ),
2826 Test(
2827 "tract,detector.raft,detector.name_in_raft",
2828 "tract,detector",
2829 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2830 datasets="flat",
2831 collections="imported_r",
2832 ),
2833 )
2835 for test in test_data:
2836 order_by = test.order_by.split(",")
2837 keys = test.keys.split(",")
2838 query = do_query(keys, test.datasets, test.collections).order_by(*order_by)
2839 if test.limit is not None:
2840 query = query.limit(*test.limit)
2841 dataIds = tuple(tuple(dataId[k] for k in keys) for dataId in query)
2842 self.assertEqual(dataIds, test.result)
2844 # and materialize
2845 query = do_query(keys).order_by(*order_by)
2846 if test.limit is not None:
2847 query = query.limit(*test.limit)
2848 with self.assertRaises(RelationalAlgebraError):
2849 with query.materialize():
2850 pass
2852 # errors in a name
2853 for order_by in ("", "-"):
2854 with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"):
2855 list(do_query().order_by(order_by))
2857 for order_by in ("undimension.name", "-undimension.name"):
2858 with self.assertRaisesRegex(ValueError, "Unknown dimension element name 'undimension'"):
2859 list(do_query().order_by(order_by))
2861 for order_by in ("attract", "-attract"):
2862 with self.assertRaisesRegex(ValueError, "Metadata 'attract' cannot be found in any dimension"):
2863 list(do_query().order_by(order_by))
2865 with self.assertRaisesRegex(ValueError, "Metadata 'exposure_time' exists in more than one dimension"):
2866 list(do_query(("exposure", "visit")).order_by("exposure_time"))
2868 with self.assertRaisesRegex(ValueError, "Timespan exists in more than one dimesion"):
2869 list(do_query(("exposure", "visit")).order_by("timespan.begin"))
2871 with self.assertRaisesRegex(
2872 ValueError, "Cannot find any temporal dimension element for 'timespan.begin'"
2873 ):
2874 list(do_query("tract").order_by("timespan.begin"))
2876 with self.assertRaisesRegex(ValueError, "Cannot use 'timespan.begin' with non-temporal element"):
2877 list(do_query("tract").order_by("tract.timespan.begin"))
2879 with self.assertRaisesRegex(ValueError, "Field 'name' does not exist in 'tract'."):
2880 list(do_query("tract").order_by("tract.name"))
2882 def testQueryDataIdsGovernorExceptions(self):
2883 """Test exceptions raised by queryDataIds() for incorrect governors."""
2884 registry = self.makeRegistry()
2885 self.loadData(registry, "base.yaml")
2886 self.loadData(registry, "datasets.yaml")
2887 self.loadData(registry, "spatial.yaml")
2889 def do_query(dimensions, dataId=None, where="", bind=None, **kwargs):
2890 return registry.queryDataIds(dimensions, dataId=dataId, where=where, bind=bind, **kwargs)
2892 Test = namedtuple(
2893 "testQueryDataIdExceptionsTest",
2894 ("dimensions", "dataId", "where", "bind", "kwargs", "exception", "count"),
2895 defaults=(None, None, None, {}, None, 0),
2896 )
2898 test_data = (
2899 Test("tract,visit", count=6),
2900 Test("tract,visit", kwargs={"instrument": "Cam1", "skymap": "SkyMap1"}, count=6),
2901 Test(
2902 "tract,visit", kwargs={"instrument": "Cam2", "skymap": "SkyMap1"}, exception=DataIdValueError
2903 ),
2904 Test("tract,visit", dataId={"instrument": "Cam1", "skymap": "SkyMap1"}, count=6),
2905 Test(
2906 "tract,visit", dataId={"instrument": "Cam1", "skymap": "SkyMap2"}, exception=DataIdValueError
2907 ),
2908 Test("tract,visit", where="instrument='Cam1' AND skymap='SkyMap1'", count=6),
2909 Test("tract,visit", where="instrument='Cam1' AND skymap='SkyMap5'", exception=DataIdValueError),
2910 Test(
2911 "tract,visit",
2912 where="instrument=cam AND skymap=map",
2913 bind={"cam": "Cam1", "map": "SkyMap1"},
2914 count=6,
2915 ),
2916 Test(
2917 "tract,visit",
2918 where="instrument=cam AND skymap=map",
2919 bind={"cam": "Cam", "map": "SkyMap"},
2920 exception=DataIdValueError,
2921 ),
2922 )
2924 for test in test_data:
2925 dimensions = test.dimensions.split(",")
2926 if test.exception:
2927 with self.assertRaises(test.exception):
2928 do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs).count()
2929 else:
2930 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2931 self.assertEqual(query.count(discard=True), test.count)
2933 # and materialize
2934 if test.exception:
2935 with self.assertRaises(test.exception):
2936 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2937 with query.materialize() as materialized:
2938 materialized.count(discard=True)
2939 else:
2940 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2941 with query.materialize() as materialized:
2942 self.assertEqual(materialized.count(discard=True), test.count)
2944 def testQueryDimensionRecordsOrderBy(self):
2945 """Test order_by and limit on result returned by
2946 queryDimensionRecords().
2947 """
2948 registry = self.makeRegistry()
2949 self.loadData(registry, "base.yaml")
2950 self.loadData(registry, "datasets.yaml")
2951 self.loadData(registry, "spatial.yaml")
2953 def do_query(element, datasets=None, collections=None):
2954 return registry.queryDimensionRecords(
2955 element, instrument="Cam1", datasets=datasets, collections=collections
2956 )
2958 query = do_query("detector")
2959 self.assertEqual(len(list(query)), 4)
2961 Test = namedtuple(
2962 "testQueryDataIdsOrderByTest",
2963 ("element", "order_by", "result", "limit", "datasets", "collections"),
2964 defaults=(None, None, None),
2965 )
2967 test_data = (
2968 Test("detector", "detector", (1, 2, 3, 4)),
2969 Test("detector", "-detector", (4, 3, 2, 1)),
2970 Test("detector", "raft,-name_in_raft", (2, 1, 4, 3)),
2971 Test("detector", "-detector.purpose", (4,), limit=(1,)),
2972 Test("detector", "-purpose,detector.raft,name_in_raft", (2, 3), limit=(2, 2)),
2973 Test("visit", "visit", (1, 2)),
2974 Test("visit", "-visit.id", (2, 1)),
2975 Test("visit", "zenith_angle", (1, 2)),
2976 Test("visit", "-visit.name", (2, 1)),
2977 Test("visit", "day_obs,-timespan.begin", (2, 1)),
2978 )
2980 for test in test_data:
2981 order_by = test.order_by.split(",")
2982 query = do_query(test.element).order_by(*order_by)
2983 if test.limit is not None:
2984 query = query.limit(*test.limit)
2985 dataIds = tuple(rec.id for rec in query)
2986 self.assertEqual(dataIds, test.result)
2988 # errors in a name
2989 for order_by in ("", "-"):
2990 with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"):
2991 list(do_query("detector").order_by(order_by))
2993 for order_by in ("undimension.name", "-undimension.name"):
2994 with self.assertRaisesRegex(ValueError, "Element name mismatch: 'undimension'"):
2995 list(do_query("detector").order_by(order_by))
2997 for order_by in ("attract", "-attract"):
2998 with self.assertRaisesRegex(ValueError, "Field 'attract' does not exist in 'detector'."):
2999 list(do_query("detector").order_by(order_by))
3001 def testQueryDimensionRecordsExceptions(self):
3002 """Test exceptions raised by queryDimensionRecords()."""
3003 registry = self.makeRegistry()
3004 self.loadData(registry, "base.yaml")
3005 self.loadData(registry, "datasets.yaml")
3006 self.loadData(registry, "spatial.yaml")
3008 result = registry.queryDimensionRecords("detector")
3009 self.assertEqual(result.count(), 4)
3010 result = registry.queryDimensionRecords("detector", instrument="Cam1")
3011 self.assertEqual(result.count(), 4)
3012 result = registry.queryDimensionRecords("detector", dataId={"instrument": "Cam1"})
3013 self.assertEqual(result.count(), 4)
3014 result = registry.queryDimensionRecords("detector", where="instrument='Cam1'")
3015 self.assertEqual(result.count(), 4)
3016 result = registry.queryDimensionRecords("detector", where="instrument=instr", bind={"instr": "Cam1"})
3017 self.assertEqual(result.count(), 4)
3019 with self.assertRaisesRegex(DataIdValueError, "dimension instrument"):
3020 result = registry.queryDimensionRecords("detector", instrument="NotCam1")
3021 result.count()
3023 with self.assertRaisesRegex(DataIdValueError, "dimension instrument"):
3024 result = registry.queryDimensionRecords("detector", dataId={"instrument": "NotCam1"})
3025 result.count()
3027 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
3028 result = registry.queryDimensionRecords("detector", where="instrument='NotCam1'")
3029 result.count()
3031 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
3032 result = registry.queryDimensionRecords(
3033 "detector", where="instrument=instr", bind={"instr": "NotCam1"}
3034 )
3035 result.count()
3037 def testDatasetConstrainedDimensionRecordQueries(self):
3038 """Test that queryDimensionRecords works even when given a dataset
3039 constraint whose dimensions extend beyond the requested dimension
3040 element's.
3041 """
3042 registry = self.makeRegistry()
3043 self.loadData(registry, "base.yaml")
3044 self.loadData(registry, "datasets.yaml")
3045 # Query for physical_filter dimension records, using a dataset that
3046 # has both physical_filter and dataset dimensions.
3047 records = registry.queryDimensionRecords(
3048 "physical_filter",
3049 datasets=["flat"],
3050 collections="imported_r",
3051 )
3052 self.assertEqual({record.name for record in records}, {"Cam1-R1", "Cam1-R2"})
3053 # Trying to constrain by all dataset types is an error.
3054 with self.assertRaises(TypeError):
3055 list(registry.queryDimensionRecords("physical_filter", datasets=..., collections="imported_r"))
3057 def testSkyPixDatasetQueries(self):
3058 """Test that we can build queries involving skypix dimensions as long
3059 as a dataset type that uses those dimensions is included.
3060 """
3061 registry = self.makeRegistry()
3062 self.loadData(registry, "base.yaml")
3063 dataset_type = DatasetType(
3064 "a", dimensions=["htm7", "instrument"], universe=registry.dimensions, storageClass="int"
3065 )
3066 registry.registerDatasetType(dataset_type)
3067 run = "r"
3068 registry.registerRun(run)
3069 # First try queries where there are no datasets; the concern is whether
3070 # we can even build and execute these queries without raising, even
3071 # when "doomed" query shortcuts are in play.
3072 self.assertFalse(
3073 list(registry.queryDataIds(["htm7", "instrument"], datasets=dataset_type, collections=run))
3074 )
3075 self.assertFalse(list(registry.queryDatasets(dataset_type, collections=run)))
3076 # Now add a dataset and see that we can get it back.
3077 htm7 = registry.dimensions.skypix["htm"][7].pixelization
3078 data_id = registry.expandDataId(instrument="Cam1", htm7=htm7.universe()[0][0])
3079 (ref,) = registry.insertDatasets(dataset_type, [data_id], run=run)
3080 self.assertEqual(
3081 set(registry.queryDataIds(["htm7", "instrument"], datasets=dataset_type, collections=run)),
3082 {data_id},
3083 )
3084 self.assertEqual(set(registry.queryDatasets(dataset_type, collections=run)), {ref})
3086 def testDatasetIdFactory(self):
3087 """Simple test for DatasetIdFactory, mostly to catch potential changes
3088 in its API.
3089 """
3090 registry = self.makeRegistry()
3091 factory = registry.datasetIdFactory
3092 dataset_type = DatasetType(
3093 "datasetType",
3094 dimensions=["detector", "instrument"],
3095 universe=registry.dimensions,
3096 storageClass="int",
3097 )
3098 run = "run"
3099 data_id = DataCoordinate.standardize(instrument="Cam1", detector=1, graph=dataset_type.dimensions)
3101 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.UNIQUE)
3102 self.assertIsInstance(datasetId, uuid.UUID)
3103 self.assertEqual(datasetId.version, 4)
3105 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE)
3106 self.assertIsInstance(datasetId, uuid.UUID)
3107 self.assertEqual(datasetId.version, 5)
3109 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN)
3110 self.assertIsInstance(datasetId, uuid.UUID)
3111 self.assertEqual(datasetId.version, 5)
3113 def testExposureQueries(self):
3114 """Test query methods using arguments sourced from the exposure log
3115 service.
3117 The most complete test dataset currently available to daf_butler tests
3118 is hsc-rc2-subset.yaml export (which is unfortunately distinct from the
3119 the lsst/rc2_subset GitHub repo), but that does not have 'exposure'
3120 dimension records as it was focused on providing nontrivial spatial
3121 overlaps between visit+detector and tract+patch. So in this test we
3122 need to translate queries that originally used the exposure dimension
3123 to use the (very similar) visit dimension instead.
3124 """
3125 registry = self.makeRegistry()
3126 self.loadData(registry, "hsc-rc2-subset.yaml")
3127 self.assertEqual(
3128 [
3129 record.id
3130 for record in registry.queryDimensionRecords("visit", instrument="HSC")
3131 .order_by("id")
3132 .limit(5)
3133 ],
3134 [318, 322, 326, 330, 332],
3135 )
3136 self.assertEqual(
3137 [
3138 data_id["visit"]
3139 for data_id in registry.queryDataIds(["visit"], instrument="HSC").order_by("id").limit(5)
3140 ],
3141 [318, 322, 326, 330, 332],
3142 )
3143 self.assertEqual(
3144 [
3145 record.id
3146 for record in registry.queryDimensionRecords("detector", instrument="HSC")
3147 .order_by("full_name")
3148 .limit(5)
3149 ],
3150 [73, 72, 71, 70, 65],
3151 )
3152 self.assertEqual(
3153 [
3154 data_id["detector"]
3155 for data_id in registry.queryDataIds(["detector"], instrument="HSC")
3156 .order_by("full_name")
3157 .limit(5)
3158 ],
3159 [73, 72, 71, 70, 65],
3160 )
3162 def test_long_query_names(self) -> None:
3163 """Test that queries involving very long names are handled correctly.
3165 This is especially important for PostgreSQL, which truncates symbols
3166 longer than 64 chars, but it's worth testing for all DBs.
3167 """
3168 registry = self.makeRegistry()
3169 name = "abcd" * 17
3170 registry.registerDatasetType(
3171 DatasetType(
3172 name,
3173 dimensions=(),
3174 storageClass="Exposure",
3175 universe=registry.dimensions,
3176 )
3177 )
3178 # Need to search more than one collection actually containing a
3179 # matching dataset to avoid optimizations that sidestep bugs due to
3180 # truncation by making findFirst=True a no-op.
3181 run1 = "run1"
3182 registry.registerRun(run1)
3183 run2 = "run2"
3184 registry.registerRun(run2)
3185 (ref1,) = registry.insertDatasets(name, [DataCoordinate.makeEmpty(registry.dimensions)], run1)
3186 registry.insertDatasets(name, [DataCoordinate.makeEmpty(registry.dimensions)], run2)
3187 self.assertEqual(
3188 set(registry.queryDatasets(name, collections=[run1, run2], findFirst=True)),
3189 {ref1},
3190 )
3192 def test_skypix_constraint_queries(self) -> None:
3193 """Test queries spatially constrained by a skypix data ID."""
3194 registry = self.makeRegistry()
3195 self.loadData(registry, "hsc-rc2-subset.yaml")
3196 patch_regions = {
3197 (data_id["tract"], data_id["patch"]): data_id.region
3198 for data_id in registry.queryDataIds(["patch"]).expanded()
3199 }
3200 skypix_dimension: SkyPixDimension = registry.dimensions["htm11"]
3201 # This check ensures the test doesn't become trivial due to a config
3202 # change; if it does, just pick a different HTML level.
3203 self.assertNotEqual(skypix_dimension, registry.dimensions.commonSkyPix)
3204 # Gather all skypix IDs that definitely overlap at least one of these
3205 # patches.
3206 relevant_skypix_ids = lsst.sphgeom.RangeSet()
3207 for patch_region in patch_regions.values():
3208 relevant_skypix_ids |= skypix_dimension.pixelization.interior(patch_region)
3209 # Look for a "nontrivial" skypix_id that overlaps at least one patch
3210 # and does not overlap at least one other patch.
3211 for skypix_id in itertools.chain.from_iterable(
3212 range(begin, end) for begin, end in relevant_skypix_ids
3213 ):
3214 skypix_region = skypix_dimension.pixelization.pixel(skypix_id)
3215 overlapping_patches = {
3216 patch_key
3217 for patch_key, patch_region in patch_regions.items()
3218 if not patch_region.isDisjointFrom(skypix_region)
3219 }
3220 if overlapping_patches and overlapping_patches != patch_regions.keys():
3221 break
3222 else:
3223 raise RuntimeError("Could not find usable skypix ID for this dimension configuration.")
3224 self.assertEqual(
3225 {
3226 (data_id["tract"], data_id["patch"])
3227 for data_id in registry.queryDataIds(
3228 ["patch"],
3229 dataId={skypix_dimension.name: skypix_id},
3230 )
3231 },
3232 overlapping_patches,
3233 )
3234 # Test that a three-way join that includes the common skypix system in
3235 # the dimensions doesn't generate redundant join terms in the query.
3236 full_data_ids = set(
3237 registry.queryDataIds(
3238 ["tract", "visit", "htm7"], skymap="hsc_rings_v1", instrument="HSC"
3239 ).expanded()
3240 )
3241 self.assertGreater(len(full_data_ids), 0)
3242 for data_id in full_data_ids:
3243 self.assertFalse(data_id.records["tract"].region.isDisjointFrom(data_id.records["htm7"].region))
3244 self.assertFalse(data_id.records["visit"].region.isDisjointFrom(data_id.records["htm7"].region))
3246 def test_spatial_constraint_queries(self) -> None:
3247 """Test queries in which one spatial dimension in the constraint (data
3248 ID or ``where`` string) constrains a different spatial dimension in the
3249 query result columns.
3250 """
3251 registry = self.makeRegistry()
3252 self.loadData(registry, "hsc-rc2-subset.yaml")
3253 patch_regions = {
3254 (data_id["tract"], data_id["patch"]): data_id.region
3255 for data_id in registry.queryDataIds(["patch"]).expanded()
3256 }
3257 observation_regions = {
3258 (data_id["visit"], data_id["detector"]): data_id.region
3259 for data_id in registry.queryDataIds(["visit", "detector"]).expanded()
3260 }
3261 all_combos = {
3262 (patch_key, observation_key)
3263 for patch_key, observation_key in itertools.product(patch_regions, observation_regions)
3264 }
3265 overlapping_combos = {
3266 (patch_key, observation_key)
3267 for patch_key, observation_key in all_combos
3268 if not patch_regions[patch_key].isDisjointFrom(observation_regions[observation_key])
3269 }
3270 # Check a direct spatial join with no constraint first.
3271 self.assertEqual(
3272 {
3273 ((data_id["tract"], data_id["patch"]), (data_id["visit"], data_id["detector"]))
3274 for data_id in registry.queryDataIds(["patch", "visit", "detector"])
3275 },
3276 overlapping_combos,
3277 )
3278 overlaps_by_patch: defaultdict[tuple[int, int], set[tuple[str, str]]] = defaultdict(set)
3279 overlaps_by_observation: defaultdict[tuple[int, int], set[tuple[str, str]]] = defaultdict(set)
3280 for patch_key, observation_key in overlapping_combos:
3281 overlaps_by_patch[patch_key].add(observation_key)
3282 overlaps_by_observation[observation_key].add(patch_key)
3283 # Find patches and observations that overlap at least one of the other
3284 # but not all of the other.
3285 nontrivial_patch = next(
3286 iter(
3287 patch_key
3288 for patch_key, observation_keys in overlaps_by_patch.items()
3289 if observation_keys and observation_keys != observation_regions.keys()
3290 )
3291 )
3292 nontrivial_observation = next(
3293 iter(
3294 observation_key
3295 for observation_key, patch_keys in overlaps_by_observation.items()
3296 if patch_keys and patch_keys != patch_regions.keys()
3297 )
3298 )
3299 # Use the nontrivial patches and observations as constraints on the
3300 # other dimensions in various ways, first via a 'where' expression.
3301 # It's better in general to us 'bind' instead of f-strings, but these
3302 # all integers so there are no quoting concerns.
3303 self.assertEqual(
3304 {
3305 (data_id["visit"], data_id["detector"])
3306 for data_id in registry.queryDataIds(
3307 ["visit", "detector"],
3308 where=f"tract={nontrivial_patch[0]} AND patch={nontrivial_patch[1]}",
3309 skymap="hsc_rings_v1",
3310 )
3311 },
3312 overlaps_by_patch[nontrivial_patch],
3313 )
3314 self.assertEqual(
3315 {
3316 (data_id["tract"], data_id["patch"])
3317 for data_id in registry.queryDataIds(
3318 ["patch"],
3319 where=f"visit={nontrivial_observation[0]} AND detector={nontrivial_observation[1]}",
3320 instrument="HSC",
3321 )
3322 },
3323 overlaps_by_observation[nontrivial_observation],
3324 )
3325 # and then via the dataId argument.
3326 self.assertEqual(
3327 {
3328 (data_id["visit"], data_id["detector"])
3329 for data_id in registry.queryDataIds(
3330 ["visit", "detector"],
3331 dataId={
3332 "tract": nontrivial_patch[0],
3333 "patch": nontrivial_patch[1],
3334 },
3335 skymap="hsc_rings_v1",
3336 )
3337 },
3338 overlaps_by_patch[nontrivial_patch],
3339 )
3340 self.assertEqual(
3341 {
3342 (data_id["tract"], data_id["patch"])
3343 for data_id in registry.queryDataIds(
3344 ["patch"],
3345 dataId={
3346 "visit": nontrivial_observation[0],
3347 "detector": nontrivial_observation[1],
3348 },
3349 instrument="HSC",
3350 )
3351 },
3352 overlaps_by_observation[nontrivial_observation],
3353 )
3355 def test_query_projection_drop_postprocessing(self) -> None:
3356 """Test that projections and deduplications on query objects can
3357 drop post-query region filtering to ensure the query remains in
3358 the SQL engine.
3359 """
3360 registry = self.makeRegistry()
3361 self.loadData(registry, "base.yaml")
3362 self.loadData(registry, "spatial.yaml")
3364 def pop_transfer(tree: Relation) -> Relation:
3365 """If a relation tree terminates with a transfer to a new engine,
3366 return the relation prior to that transfer. If not, return the
3367 original relation.
3368 """
3369 match tree:
3370 case Transfer(target=target):
3371 return target
3372 case _:
3373 return tree
3375 # There's no public way to get a Query object yet, so we get one from a
3376 # DataCoordinateQueryResults private attribute. When a public API is
3377 # available this test should use it.
3378 query = registry.queryDataIds(["visit", "detector", "tract", "patch"])._query
3379 # We expect this query to terminate in the iteration engine originally,
3380 # because region-filtering is necessary.
3381 self.assertIsInstance(pop_transfer(query.relation).engine, iteration.Engine)
3382 # If we deduplicate, we usually have to do that downstream of the
3383 # filtering. That means the deduplication has to happen in the
3384 # iteration engine.
3385 self.assertIsInstance(pop_transfer(query.projected(unique=True).relation).engine, iteration.Engine)
3386 # If we pass drop_postprocessing, we instead drop the region filtering
3387 # so the deduplication can happen in SQL (though there might still be
3388 # transfer to iteration at the tail of the tree that we can ignore;
3389 # that's what the pop_transfer takes care of here).
3390 self.assertIsInstance(
3391 pop_transfer(query.projected(unique=True, drop_postprocessing=True).relation).engine,
3392 sql.Engine,
3393 )
3395 def test_query_empty_collections(self) -> None:
3396 """Test for registry query methods with empty collections. The methods
3397 should return empty result set (or None when applicable) and provide
3398 "doomed" diagnostics.
3399 """
3400 registry = self.makeRegistry()
3401 self.loadData(registry, "base.yaml")
3402 self.loadData(registry, "datasets.yaml")
3404 # Tests for registry.findDataset()
3405 with self.assertRaises(NoDefaultCollectionError):
3406 registry.findDataset("bias", instrument="Cam1", detector=1)
3407 self.assertIsNotNone(registry.findDataset("bias", instrument="Cam1", detector=1, collections=...))
3408 self.assertIsNone(registry.findDataset("bias", instrument="Cam1", detector=1, collections=[]))
3410 # Tests for registry.queryDatasets()
3411 with self.assertRaises(NoDefaultCollectionError):
3412 registry.queryDatasets("bias")
3413 self.assertTrue(list(registry.queryDatasets("bias", collections=...)))
3415 result = registry.queryDatasets("bias", collections=[])
3416 self.assertEqual(len(list(result)), 0)
3417 messages = list(result.explain_no_results())
3418 self.assertTrue(messages)
3419 self.assertTrue(any("because collection list is empty" in message for message in messages))
3421 # Tests for registry.queryDataIds()
3422 with self.assertRaises(NoDefaultCollectionError):
3423 registry.queryDataIds("detector", datasets="bias")
3424 self.assertTrue(list(registry.queryDataIds("detector", datasets="bias", collections=...)))
3426 result = registry.queryDataIds("detector", datasets="bias", collections=[])
3427 self.assertEqual(len(list(result)), 0)
3428 messages = list(result.explain_no_results())
3429 self.assertTrue(messages)
3430 self.assertTrue(any("because collection list is empty" in message for message in messages))
3432 # Tests for registry.queryDimensionRecords()
3433 with self.assertRaises(NoDefaultCollectionError):
3434 registry.queryDimensionRecords("detector", datasets="bias")
3435 self.assertTrue(list(registry.queryDimensionRecords("detector", datasets="bias", collections=...)))
3437 result = registry.queryDimensionRecords("detector", datasets="bias", collections=[])
3438 self.assertEqual(len(list(result)), 0)
3439 messages = list(result.explain_no_results())
3440 self.assertTrue(messages)
3441 self.assertTrue(any("because collection list is empty" in message for message in messages))