Coverage for python/lsst/daf/butler/registry/tests/_registry.py: 5%
1466 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-14 19:21 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-14 19:21 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["RegistryTests"]
25import itertools
26import logging
27import os
28import re
29import unittest
30import uuid
31from abc import ABC, abstractmethod
32from collections import defaultdict, namedtuple
33from collections.abc import Iterator
34from datetime import datetime, timedelta
35from typing import TYPE_CHECKING
37import astropy.time
38import sqlalchemy
40try:
41 import numpy as np
42except ImportError:
43 np = None
45import lsst.sphgeom
46from lsst.daf.relation import Relation, RelationalAlgebraError, Transfer, iteration, sql
48from ...core import (
49 DataCoordinate,
50 DataCoordinateSet,
51 DatasetAssociation,
52 DatasetIdGenEnum,
53 DatasetRef,
54 DatasetType,
55 DimensionGraph,
56 NamedValueSet,
57 SkyPixDimension,
58 StorageClass,
59 Timespan,
60 ddl,
61)
62from .._collection_summary import CollectionSummary
63from .._collectionType import CollectionType
64from .._config import RegistryConfig
65from .._exceptions import (
66 ArgumentError,
67 CollectionError,
68 CollectionTypeError,
69 ConflictingDefinitionError,
70 DataIdValueError,
71 DatasetTypeError,
72 InconsistentDataIdError,
73 MissingCollectionError,
74 MissingDatasetTypeError,
75 NoDefaultCollectionError,
76 OrphanedRecordError,
77)
78from ..interfaces import ButlerAttributeExistsError
80if TYPE_CHECKING:
81 from .._registry import Registry
84class RegistryTests(ABC):
85 """Generic tests for the `Registry` class that can be subclassed to
86 generate tests for different configurations.
87 """
89 collectionsManager: str | None = None
90 """Name of the collections manager class, if subclass provides value for
91 this member then it overrides name specified in default configuration
92 (`str`).
93 """
95 datasetsManager: str | dict[str, str] | None = None
96 """Name or configuration dictionary of the datasets manager class, if
97 subclass provides value for this member then it overrides name specified
98 in default configuration (`str` or `dict`).
99 """
101 @classmethod
102 @abstractmethod
103 def getDataDir(cls) -> str:
104 """Return the root directory containing test data YAML files."""
105 raise NotImplementedError()
107 def makeRegistryConfig(self) -> RegistryConfig:
108 """Create RegistryConfig used to create a registry.
110 This method should be called by a subclass from `makeRegistry`.
111 Returned instance will be pre-configured based on the values of class
112 members, and default-configured for all other parameters. Subclasses
113 that need default configuration should just instantiate
114 `RegistryConfig` directly.
115 """
116 config = RegistryConfig()
117 if self.collectionsManager:
118 config["managers", "collections"] = self.collectionsManager
119 if self.datasetsManager:
120 config["managers", "datasets"] = self.datasetsManager
121 return config
123 @abstractmethod
124 def makeRegistry(self, share_repo_with: Registry | None = None) -> Registry | None:
125 """Return the Registry instance to be tested.
127 Parameters
128 ----------
129 share_repo_with : `Registry`, optional
130 If provided, the new registry should point to the same data
131 repository as this existing registry.
133 Returns
134 -------
135 registry : `Registry`
136 New `Registry` instance, or `None` *only* if `share_repo_with` is
137 not `None` and this test case does not support that argument
138 (e.g. it is impossible with in-memory SQLite DBs).
139 """
140 raise NotImplementedError()
142 def loadData(self, registry: Registry, filename: str):
143 """Load registry test data from ``getDataDir/<filename>``,
144 which should be a YAML import/export file.
145 """
146 from ...transfers import YamlRepoImportBackend
148 with open(os.path.join(self.getDataDir(), filename)) as stream:
149 backend = YamlRepoImportBackend(stream, registry)
150 backend.register()
151 backend.load(datastore=None)
153 def checkQueryResults(self, results, expected):
154 """Check that a query results object contains expected values.
156 Parameters
157 ----------
158 results : `DataCoordinateQueryResults` or `DatasetQueryResults`
159 A lazy-evaluation query results object.
160 expected : `list`
161 A list of `DataCoordinate` o `DatasetRef` objects that should be
162 equal to results of the query, aside from ordering.
163 """
164 self.assertCountEqual(list(results), expected)
165 self.assertEqual(results.count(), len(expected))
166 if expected:
167 self.assertTrue(results.any())
168 else:
169 self.assertFalse(results.any())
171 def testOpaque(self):
172 """Tests for `Registry.registerOpaqueTable`,
173 `Registry.insertOpaqueData`, `Registry.fetchOpaqueData`, and
174 `Registry.deleteOpaqueData`.
175 """
176 registry = self.makeRegistry()
177 table = "opaque_table_for_testing"
178 registry.registerOpaqueTable(
179 table,
180 spec=ddl.TableSpec(
181 fields=[
182 ddl.FieldSpec("id", dtype=sqlalchemy.BigInteger, primaryKey=True),
183 ddl.FieldSpec("name", dtype=sqlalchemy.String, length=16, nullable=False),
184 ddl.FieldSpec("count", dtype=sqlalchemy.SmallInteger, nullable=True),
185 ],
186 ),
187 )
188 rows = [
189 {"id": 1, "name": "one", "count": None},
190 {"id": 2, "name": "two", "count": 5},
191 {"id": 3, "name": "three", "count": 6},
192 ]
193 registry.insertOpaqueData(table, *rows)
194 self.assertCountEqual(rows, list(registry.fetchOpaqueData(table)))
195 self.assertEqual(rows[0:1], list(registry.fetchOpaqueData(table, id=1)))
196 self.assertEqual(rows[1:2], list(registry.fetchOpaqueData(table, name="two")))
197 self.assertEqual(rows[0:1], list(registry.fetchOpaqueData(table, id=(1, 3), name=("one", "two"))))
198 self.assertEqual(rows, list(registry.fetchOpaqueData(table, id=(1, 2, 3))))
199 # Test very long IN clause which exceeds sqlite limit on number of
200 # parameters. SQLite says the limit is 32k but it looks like it is
201 # much higher.
202 self.assertEqual(rows, list(registry.fetchOpaqueData(table, id=list(range(300_000)))))
203 # Two IN clauses, each longer than 1k batch size, first with
204 # duplicates, second has matching elements in different batches (after
205 # sorting).
206 self.assertEqual(
207 rows[0:2],
208 list(
209 registry.fetchOpaqueData(
210 table,
211 id=list(range(1000)) + list(range(100, 0, -1)),
212 name=["one"] + [f"q{i}" for i in range(2200)] + ["two"],
213 )
214 ),
215 )
216 self.assertEqual([], list(registry.fetchOpaqueData(table, id=1, name="two")))
217 registry.deleteOpaqueData(table, id=3)
218 self.assertCountEqual(rows[:2], list(registry.fetchOpaqueData(table)))
219 registry.deleteOpaqueData(table)
220 self.assertEqual([], list(registry.fetchOpaqueData(table)))
222 def testDatasetType(self):
223 """Tests for `Registry.registerDatasetType` and
224 `Registry.getDatasetType`.
225 """
226 registry = self.makeRegistry()
227 # Check valid insert
228 datasetTypeName = "test"
229 storageClass = StorageClass("testDatasetType")
230 registry.storageClasses.registerStorageClass(storageClass)
231 dimensions = registry.dimensions.extract(("instrument", "visit"))
232 differentDimensions = registry.dimensions.extract(("instrument", "patch"))
233 inDatasetType = DatasetType(datasetTypeName, dimensions, storageClass)
234 # Inserting for the first time should return True
235 self.assertTrue(registry.registerDatasetType(inDatasetType))
236 outDatasetType1 = registry.getDatasetType(datasetTypeName)
237 self.assertEqual(outDatasetType1, inDatasetType)
239 # Re-inserting should work
240 self.assertFalse(registry.registerDatasetType(inDatasetType))
241 # Except when they are not identical
242 with self.assertRaises(ConflictingDefinitionError):
243 nonIdenticalDatasetType = DatasetType(datasetTypeName, differentDimensions, storageClass)
244 registry.registerDatasetType(nonIdenticalDatasetType)
246 # Template can be None
247 datasetTypeName = "testNoneTemplate"
248 storageClass = StorageClass("testDatasetType2")
249 registry.storageClasses.registerStorageClass(storageClass)
250 dimensions = registry.dimensions.extract(("instrument", "visit"))
251 inDatasetType = DatasetType(datasetTypeName, dimensions, storageClass)
252 registry.registerDatasetType(inDatasetType)
253 outDatasetType2 = registry.getDatasetType(datasetTypeName)
254 self.assertEqual(outDatasetType2, inDatasetType)
256 allTypes = set(registry.queryDatasetTypes())
257 self.assertEqual(allTypes, {outDatasetType1, outDatasetType2})
259 def testDimensions(self):
260 """Tests for `Registry.insertDimensionData`,
261 `Registry.syncDimensionData`, and `Registry.expandDataId`.
262 """
263 registry = self.makeRegistry()
264 dimensionName = "instrument"
265 dimension = registry.dimensions[dimensionName]
266 dimensionValue = {
267 "name": "DummyCam",
268 "visit_max": 10,
269 "visit_system": 0,
270 "exposure_max": 10,
271 "detector_max": 2,
272 "class_name": "lsst.pipe.base.Instrument",
273 }
274 registry.insertDimensionData(dimensionName, dimensionValue)
275 # Inserting the same value twice should fail
276 with self.assertRaises(sqlalchemy.exc.IntegrityError):
277 registry.insertDimensionData(dimensionName, dimensionValue)
278 # expandDataId should retrieve the record we just inserted
279 self.assertEqual(
280 registry.expandDataId(instrument="DummyCam", graph=dimension.graph)
281 .records[dimensionName]
282 .toDict(),
283 dimensionValue,
284 )
285 # expandDataId should raise if there is no record with the given ID.
286 with self.assertRaises(DataIdValueError):
287 registry.expandDataId({"instrument": "Unknown"}, graph=dimension.graph)
288 # band doesn't have a table; insert should fail.
289 with self.assertRaises(TypeError):
290 registry.insertDimensionData("band", {"band": "i"})
291 dimensionName2 = "physical_filter"
292 dimension2 = registry.dimensions[dimensionName2]
293 dimensionValue2 = {"name": "DummyCam_i", "band": "i"}
294 # Missing required dependency ("instrument") should fail
295 with self.assertRaises(KeyError):
296 registry.insertDimensionData(dimensionName2, dimensionValue2)
297 # Adding required dependency should fix the failure
298 dimensionValue2["instrument"] = "DummyCam"
299 registry.insertDimensionData(dimensionName2, dimensionValue2)
300 # expandDataId should retrieve the record we just inserted.
301 self.assertEqual(
302 registry.expandDataId(instrument="DummyCam", physical_filter="DummyCam_i", graph=dimension2.graph)
303 .records[dimensionName2]
304 .toDict(),
305 dimensionValue2,
306 )
307 # Use syncDimensionData to insert a new record successfully.
308 dimensionName3 = "detector"
309 dimensionValue3 = {
310 "instrument": "DummyCam",
311 "id": 1,
312 "full_name": "one",
313 "name_in_raft": "zero",
314 "purpose": "SCIENCE",
315 }
316 self.assertTrue(registry.syncDimensionData(dimensionName3, dimensionValue3))
317 # Sync that again. Note that one field ("raft") is NULL, and that
318 # should be okay.
319 self.assertFalse(registry.syncDimensionData(dimensionName3, dimensionValue3))
320 # Now try that sync with the same primary key but a different value.
321 # This should fail.
322 with self.assertRaises(ConflictingDefinitionError):
323 registry.syncDimensionData(
324 dimensionName3,
325 {
326 "instrument": "DummyCam",
327 "id": 1,
328 "full_name": "one",
329 "name_in_raft": "four",
330 "purpose": "SCIENCE",
331 },
332 )
334 @unittest.skipIf(np is None, "numpy not available.")
335 def testNumpyDataId(self):
336 """Test that we can use a numpy int in a dataId."""
337 registry = self.makeRegistry()
338 dimensionEntries = [
339 ("instrument", {"instrument": "DummyCam"}),
340 ("physical_filter", {"instrument": "DummyCam", "name": "d-r", "band": "R"}),
341 # Using an np.int64 here fails unless Records.fromDict is also
342 # patched to look for numbers.Integral
343 ("visit", {"instrument": "DummyCam", "id": 42, "name": "fortytwo", "physical_filter": "d-r"}),
344 ]
345 for args in dimensionEntries:
346 registry.insertDimensionData(*args)
348 # Try a normal integer and something that looks like an int but
349 # is not.
350 for visit_id in (42, np.int64(42)):
351 with self.subTest(visit_id=visit_id, id_type=type(visit_id).__name__):
352 expanded = registry.expandDataId({"instrument": "DummyCam", "visit": visit_id})
353 self.assertEqual(expanded["visit"], int(visit_id))
354 self.assertIsInstance(expanded["visit"], int)
356 def testDataIdRelationships(self):
357 """Test that `Registry.expandDataId` raises an exception when the given
358 keys are inconsistent.
359 """
360 registry = self.makeRegistry()
361 self.loadData(registry, "base.yaml")
362 # Insert a few more dimension records for the next test.
363 registry.insertDimensionData(
364 "exposure",
365 {"instrument": "Cam1", "id": 1, "obs_id": "one", "physical_filter": "Cam1-G"},
366 )
367 registry.insertDimensionData(
368 "exposure",
369 {"instrument": "Cam1", "id": 2, "obs_id": "two", "physical_filter": "Cam1-G"},
370 )
371 registry.insertDimensionData(
372 "visit_system",
373 {"instrument": "Cam1", "id": 0, "name": "one-to-one"},
374 )
375 registry.insertDimensionData(
376 "visit",
377 {"instrument": "Cam1", "id": 1, "name": "one", "physical_filter": "Cam1-G", "visit_system": 0},
378 )
379 registry.insertDimensionData(
380 "visit_definition",
381 {"instrument": "Cam1", "visit": 1, "exposure": 1, "visit_system": 0},
382 )
383 with self.assertRaises(InconsistentDataIdError):
384 registry.expandDataId(
385 {"instrument": "Cam1", "visit": 1, "exposure": 2},
386 )
388 def testDataset(self):
389 """Basic tests for `Registry.insertDatasets`, `Registry.getDataset`,
390 and `Registry.removeDatasets`.
391 """
392 registry = self.makeRegistry()
393 self.loadData(registry, "base.yaml")
394 run = "tésτ"
395 registry.registerRun(run)
396 datasetType = registry.getDatasetType("bias")
397 dataId = {"instrument": "Cam1", "detector": 2}
398 (ref,) = registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
399 outRef = registry.getDataset(ref.id)
400 self.assertIsNotNone(ref.id)
401 self.assertEqual(ref, outRef)
402 with self.assertRaises(ConflictingDefinitionError):
403 registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
404 registry.removeDatasets([ref])
405 self.assertIsNone(registry.findDataset(datasetType, dataId, collections=[run]))
407 def testFindDataset(self):
408 """Tests for `Registry.findDataset`."""
409 registry = self.makeRegistry()
410 self.loadData(registry, "base.yaml")
411 run = "tésτ"
412 datasetType = registry.getDatasetType("bias")
413 dataId = {"instrument": "Cam1", "detector": 4}
414 registry.registerRun(run)
415 (inputRef,) = registry.insertDatasets(datasetType, dataIds=[dataId], run=run)
416 outputRef = registry.findDataset(datasetType, dataId, collections=[run])
417 self.assertEqual(outputRef, inputRef)
418 # Check that retrieval with invalid dataId raises
419 with self.assertRaises(LookupError):
420 dataId = {"instrument": "Cam1"} # no detector
421 registry.findDataset(datasetType, dataId, collections=run)
422 # Check that different dataIds match to different datasets
423 dataId1 = {"instrument": "Cam1", "detector": 1}
424 (inputRef1,) = registry.insertDatasets(datasetType, dataIds=[dataId1], run=run)
425 dataId2 = {"instrument": "Cam1", "detector": 2}
426 (inputRef2,) = registry.insertDatasets(datasetType, dataIds=[dataId2], run=run)
427 self.assertEqual(registry.findDataset(datasetType, dataId1, collections=run), inputRef1)
428 self.assertEqual(registry.findDataset(datasetType, dataId2, collections=run), inputRef2)
429 self.assertNotEqual(registry.findDataset(datasetType, dataId1, collections=run), inputRef2)
430 self.assertNotEqual(registry.findDataset(datasetType, dataId2, collections=run), inputRef1)
431 # Check that requesting a non-existing dataId returns None
432 nonExistingDataId = {"instrument": "Cam1", "detector": 3}
433 self.assertIsNone(registry.findDataset(datasetType, nonExistingDataId, collections=run))
434 # Search more than one collection, in which two have the right
435 # dataset type and another does not.
436 registry.registerRun("empty")
437 self.loadData(registry, "datasets-uuid.yaml")
438 bias1 = registry.findDataset("bias", instrument="Cam1", detector=2, collections=["imported_g"])
439 self.assertIsNotNone(bias1)
440 bias2 = registry.findDataset("bias", instrument="Cam1", detector=2, collections=["imported_r"])
441 self.assertIsNotNone(bias2)
442 self.assertEqual(
443 bias1,
444 registry.findDataset(
445 "bias", instrument="Cam1", detector=2, collections=["empty", "imported_g", "imported_r"]
446 ),
447 )
448 self.assertEqual(
449 bias2,
450 registry.findDataset(
451 "bias", instrument="Cam1", detector=2, collections=["empty", "imported_r", "imported_g"]
452 ),
453 )
454 # Search more than one collection, with one of them a CALIBRATION
455 # collection.
456 registry.registerCollection("Cam1/calib", CollectionType.CALIBRATION)
457 timespan = Timespan(
458 begin=astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai"),
459 end=astropy.time.Time("2020-01-01T02:00:00", format="isot", scale="tai"),
460 )
461 registry.certify("Cam1/calib", [bias2], timespan=timespan)
462 self.assertEqual(
463 bias1,
464 registry.findDataset(
465 "bias",
466 instrument="Cam1",
467 detector=2,
468 collections=["empty", "imported_g", "Cam1/calib"],
469 timespan=timespan,
470 ),
471 )
472 self.assertEqual(
473 bias2,
474 registry.findDataset(
475 "bias",
476 instrument="Cam1",
477 detector=2,
478 collections=["empty", "Cam1/calib", "imported_g"],
479 timespan=timespan,
480 ),
481 )
482 # If we try to search those same collections without a timespan, it
483 # should still work, since the CALIBRATION collection is ignored.
484 self.assertEqual(
485 bias1,
486 registry.findDataset(
487 "bias", instrument="Cam1", detector=2, collections=["empty", "imported_g", "Cam1/calib"]
488 ),
489 )
490 self.assertEqual(
491 bias1,
492 registry.findDataset(
493 "bias", instrument="Cam1", detector=2, collections=["empty", "Cam1/calib", "imported_g"]
494 ),
495 )
497 def testRemoveDatasetTypeSuccess(self):
498 """Test that Registry.removeDatasetType works when there are no
499 datasets of that type present.
500 """
501 registry = self.makeRegistry()
502 self.loadData(registry, "base.yaml")
503 registry.removeDatasetType("flat")
504 with self.assertRaises(MissingDatasetTypeError):
505 registry.getDatasetType("flat")
507 def testRemoveDatasetTypeFailure(self):
508 """Test that Registry.removeDatasetType raises when there are datasets
509 of that type present or if the dataset type is for a component.
510 """
511 registry = self.makeRegistry()
512 self.loadData(registry, "base.yaml")
513 self.loadData(registry, "datasets.yaml")
514 with self.assertRaises(OrphanedRecordError):
515 registry.removeDatasetType("flat")
516 with self.assertRaises(ValueError):
517 registry.removeDatasetType(DatasetType.nameWithComponent("flat", "image"))
519 def testImportDatasetsUUID(self):
520 """Test for `Registry._importDatasets` with UUID dataset ID."""
521 if isinstance(self.datasetsManager, str):
522 if not self.datasetsManager.endswith(".ByDimensionsDatasetRecordStorageManagerUUID"):
523 self.skipTest(f"Unexpected dataset manager {self.datasetsManager}")
524 elif isinstance(self.datasetsManager, dict):
525 if not self.datasetsManager["cls"].endswith(".ByDimensionsDatasetRecordStorageManagerUUID"):
526 self.skipTest(f"Unexpected dataset manager {self.datasetsManager['cls']}")
528 registry = self.makeRegistry()
529 self.loadData(registry, "base.yaml")
530 for run in range(6):
531 registry.registerRun(f"run{run}")
532 datasetTypeBias = registry.getDatasetType("bias")
533 datasetTypeFlat = registry.getDatasetType("flat")
534 dataIdBias1 = {"instrument": "Cam1", "detector": 1}
535 dataIdBias2 = {"instrument": "Cam1", "detector": 2}
536 dataIdFlat1 = {"instrument": "Cam1", "detector": 1, "physical_filter": "Cam1-G", "band": "g"}
538 ref = DatasetRef(datasetTypeBias, dataIdBias1, run="run0")
539 (ref1,) = registry._importDatasets([ref])
540 # UUID is used without change
541 self.assertEqual(ref.id, ref1.id)
543 # All different failure modes
544 refs = (
545 # Importing same DatasetRef with different dataset ID is an error
546 DatasetRef(datasetTypeBias, dataIdBias1, run="run0"),
547 # Same DatasetId but different DataId
548 DatasetRef(datasetTypeBias, dataIdBias2, id=ref1.id, run="run0"),
549 DatasetRef(datasetTypeFlat, dataIdFlat1, id=ref1.id, run="run0"),
550 # Same DatasetRef and DatasetId but different run
551 DatasetRef(datasetTypeBias, dataIdBias1, id=ref1.id, run="run1"),
552 )
553 for ref in refs:
554 with self.assertRaises(ConflictingDefinitionError):
555 registry._importDatasets([ref])
557 # Test for non-unique IDs, they can be re-imported multiple times.
558 for run, idGenMode in ((2, DatasetIdGenEnum.DATAID_TYPE), (4, DatasetIdGenEnum.DATAID_TYPE_RUN)):
559 with self.subTest(idGenMode=idGenMode):
560 # Make dataset ref with reproducible dataset ID.
561 ref = DatasetRef(datasetTypeBias, dataIdBias1, run=f"run{run}", id_generation_mode=idGenMode)
562 (ref1,) = registry._importDatasets([ref])
563 self.assertIsInstance(ref1.id, uuid.UUID)
564 self.assertEqual(ref1.id.version, 5)
565 self.assertEqual(ref1.id, ref.id)
567 # Importing it again is OK
568 (ref2,) = registry._importDatasets([ref1])
569 self.assertEqual(ref2.id, ref1.id)
571 # Cannot import to different run with the same ID
572 ref = DatasetRef(datasetTypeBias, dataIdBias1, id=ref1.id, run=f"run{run+1}")
573 with self.assertRaises(ConflictingDefinitionError):
574 registry._importDatasets([ref])
576 ref = DatasetRef(
577 datasetTypeBias, dataIdBias1, run=f"run{run+1}", id_generation_mode=idGenMode
578 )
579 if idGenMode is DatasetIdGenEnum.DATAID_TYPE:
580 # Cannot import same DATAID_TYPE ref into a new run
581 with self.assertRaises(ConflictingDefinitionError):
582 (ref2,) = registry._importDatasets([ref])
583 else:
584 # DATAID_TYPE_RUN ref can be imported into a new run
585 (ref2,) = registry._importDatasets([ref])
587 def testDatasetTypeComponentQueries(self):
588 """Test component options when querying for dataset types.
590 All of the behavior here is deprecated, so many of these tests are
591 currently wrapped in a context to check that we get a warning whenever
592 a component dataset is actually returned.
593 """
594 registry = self.makeRegistry()
595 self.loadData(registry, "base.yaml")
596 self.loadData(registry, "datasets.yaml")
597 # Test querying for dataset types with different inputs.
598 # First query for all dataset types; components should only be included
599 # when components=True.
600 self.assertEqual({"bias", "flat"}, NamedValueSet(registry.queryDatasetTypes()).names)
601 self.assertEqual({"bias", "flat"}, NamedValueSet(registry.queryDatasetTypes(components=False)).names)
602 with self.assertWarns(FutureWarning):
603 self.assertLess(
604 {"bias", "flat", "bias.wcs", "flat.photoCalib"},
605 NamedValueSet(registry.queryDatasetTypes(components=True)).names,
606 )
607 # Use a pattern that can match either parent or components. Again,
608 # components are only returned if components=True.
609 self.assertEqual({"bias"}, NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"))).names)
610 self.assertEqual(
611 {"bias"}, NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"), components=False)).names
612 )
613 with self.assertWarns(FutureWarning):
614 self.assertLess(
615 {"bias", "bias.wcs"},
616 NamedValueSet(registry.queryDatasetTypes(re.compile("^bias.*"), components=True)).names,
617 )
618 # This pattern matches only a component. In this case we also return
619 # that component dataset type if components=None.
620 with self.assertWarns(FutureWarning):
621 self.assertEqual(
622 {"bias.wcs"}, NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"))).names
623 )
624 self.assertEqual(
625 set(),
626 NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"), components=False)).names,
627 )
628 with self.assertWarns(FutureWarning):
629 self.assertEqual(
630 {"bias.wcs"},
631 NamedValueSet(registry.queryDatasetTypes(re.compile(r"^bias\.wcs"), components=True)).names,
632 )
633 # Add a dataset type using a StorageClass that we'll then remove; check
634 # that this does not affect our ability to query for dataset types
635 # (though it will warn).
636 tempStorageClass = StorageClass(
637 name="TempStorageClass",
638 components={
639 "data1": registry.storageClasses.getStorageClass("StructuredDataDict"),
640 "data2": registry.storageClasses.getStorageClass("StructuredDataDict"),
641 },
642 )
643 registry.storageClasses.registerStorageClass(tempStorageClass)
644 datasetType = DatasetType(
645 "temporary",
646 dimensions=["instrument"],
647 storageClass=tempStorageClass,
648 universe=registry.dimensions,
649 )
650 registry.registerDatasetType(datasetType)
651 registry.storageClasses._unregisterStorageClass(tempStorageClass.name)
652 datasetType._storageClass = None
653 del tempStorageClass
654 # Querying for all dataset types, including components, should include
655 # at least all non-component dataset types (and I don't want to
656 # enumerate all of the Exposure components for bias and flat here).
657 with self.assertWarns(FutureWarning):
658 with self.assertLogs("lsst.daf.butler.registry", logging.WARN) as cm:
659 everything = NamedValueSet(registry.queryDatasetTypes(components=True))
660 self.assertIn("TempStorageClass", cm.output[0])
661 self.assertLess({"bias", "flat", "temporary"}, everything.names)
662 # It should not include "temporary.columns", because we tried to remove
663 # the storage class that would tell it about that. So if the next line
664 # fails (i.e. "temporary.columns" _is_ in everything.names), it means
665 # this part of the test isn't doing anything, because the _unregister
666 # call about isn't simulating the real-life case we want it to
667 # simulate, in which different versions of daf_butler in entirely
668 # different Python processes interact with the same repo.
669 self.assertNotIn("temporary.data", everything.names)
670 # Query for dataset types that start with "temp". This should again
671 # not include the component, and also not fail.
672 with self.assertLogs("lsst.daf.butler.registry", logging.WARN) as cm:
673 startsWithTemp = NamedValueSet(registry.queryDatasetTypes(re.compile("temp.*"), components=True))
674 self.assertIn("TempStorageClass", cm.output[0])
675 self.assertEqual({"temporary"}, startsWithTemp.names)
676 # Querying with no components should not warn at all.
677 with self.assertLogs("lsst.daf.butler.registries", logging.WARN) as cm:
678 startsWithTemp = NamedValueSet(registry.queryDatasetTypes(re.compile("temp.*"), components=False))
679 # Must issue a warning of our own to be captured.
680 logging.getLogger("lsst.daf.butler.registries").warning("test message")
681 self.assertEqual(len(cm.output), 1)
682 self.assertIn("test message", cm.output[0])
684 def testComponentLookups(self):
685 """Test searching for component datasets via their parents.
687 All of the behavior here is deprecated, so many of these tests are
688 currently wrapped in a context to check that we get a warning whenever
689 a component dataset is actually returned.
690 """
691 registry = self.makeRegistry()
692 self.loadData(registry, "base.yaml")
693 self.loadData(registry, "datasets.yaml")
694 # Test getting the child dataset type (which does still exist in the
695 # Registry), and check for consistency with
696 # DatasetRef.makeComponentRef.
697 collection = "imported_g"
698 parentType = registry.getDatasetType("bias")
699 childType = registry.getDatasetType("bias.wcs")
700 parentRefResolved = registry.findDataset(
701 parentType, collections=collection, instrument="Cam1", detector=1
702 )
703 self.assertIsInstance(parentRefResolved, DatasetRef)
704 self.assertEqual(childType, parentRefResolved.makeComponentRef("wcs").datasetType)
705 # Search for a single dataset with findDataset.
706 childRef1 = registry.findDataset("bias.wcs", collections=collection, dataId=parentRefResolved.dataId)
707 self.assertEqual(childRef1, parentRefResolved.makeComponentRef("wcs"))
708 # Search for detector data IDs constrained by component dataset
709 # existence with queryDataIds.
710 with self.assertWarns(FutureWarning):
711 dataIds = registry.queryDataIds(
712 ["detector"],
713 datasets=["bias.wcs"],
714 collections=collection,
715 ).toSet()
716 self.assertEqual(
717 dataIds,
718 DataCoordinateSet(
719 {
720 DataCoordinate.standardize(instrument="Cam1", detector=d, graph=parentType.dimensions)
721 for d in (1, 2, 3)
722 },
723 parentType.dimensions,
724 ),
725 )
726 # Search for multiple datasets of a single type with queryDatasets.
727 with self.assertWarns(FutureWarning):
728 childRefs2 = set(
729 registry.queryDatasets(
730 "bias.wcs",
731 collections=collection,
732 )
733 )
734 self.assertEqual({ref.datasetType for ref in childRefs2}, {childType})
735 self.assertEqual({ref.dataId for ref in childRefs2}, set(dataIds))
737 def testCollections(self):
738 """Tests for registry methods that manage collections."""
739 registry = self.makeRegistry()
740 other_registry = self.makeRegistry(share_repo_with=registry)
741 self.loadData(registry, "base.yaml")
742 self.loadData(registry, "datasets.yaml")
743 run1 = "imported_g"
744 run2 = "imported_r"
745 # Test setting a collection docstring after it has been created.
746 registry.setCollectionDocumentation(run1, "doc for run1")
747 self.assertEqual(registry.getCollectionDocumentation(run1), "doc for run1")
748 registry.setCollectionDocumentation(run1, None)
749 self.assertIsNone(registry.getCollectionDocumentation(run1))
750 datasetType = "bias"
751 # Find some datasets via their run's collection.
752 dataId1 = {"instrument": "Cam1", "detector": 1}
753 ref1 = registry.findDataset(datasetType, dataId1, collections=run1)
754 self.assertIsNotNone(ref1)
755 dataId2 = {"instrument": "Cam1", "detector": 2}
756 ref2 = registry.findDataset(datasetType, dataId2, collections=run1)
757 self.assertIsNotNone(ref2)
758 # Associate those into a new collection, then look for them there.
759 tag1 = "tag1"
760 registry.registerCollection(tag1, type=CollectionType.TAGGED, doc="doc for tag1")
761 # Check that we can query for old and new collections by type.
762 self.assertEqual(set(registry.queryCollections(collectionTypes=CollectionType.RUN)), {run1, run2})
763 self.assertEqual(
764 set(registry.queryCollections(collectionTypes={CollectionType.TAGGED, CollectionType.RUN})),
765 {tag1, run1, run2},
766 )
767 self.assertEqual(registry.getCollectionDocumentation(tag1), "doc for tag1")
768 registry.associate(tag1, [ref1, ref2])
769 self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
770 self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
771 # Disassociate one and verify that we can't it there anymore...
772 registry.disassociate(tag1, [ref1])
773 self.assertIsNone(registry.findDataset(datasetType, dataId1, collections=tag1))
774 # ...but we can still find ref2 in tag1, and ref1 in the run.
775 self.assertEqual(registry.findDataset(datasetType, dataId1, collections=run1), ref1)
776 self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
777 collections = set(registry.queryCollections())
778 self.assertEqual(collections, {run1, run2, tag1})
779 # Associate both refs into tag1 again; ref2 is already there, but that
780 # should be a harmless no-op.
781 registry.associate(tag1, [ref1, ref2])
782 self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
783 self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
784 # Get a different dataset (from a different run) that has the same
785 # dataset type and data ID as ref2.
786 ref2b = registry.findDataset(datasetType, dataId2, collections=run2)
787 self.assertNotEqual(ref2, ref2b)
788 # Attempting to associate that into tag1 should be an error.
789 with self.assertRaises(ConflictingDefinitionError):
790 registry.associate(tag1, [ref2b])
791 # That error shouldn't have messed up what we had before.
792 self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
793 self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
794 # Attempt to associate the conflicting dataset again, this time with
795 # a dataset that isn't in the collection and won't cause a conflict.
796 # Should also fail without modifying anything.
797 dataId3 = {"instrument": "Cam1", "detector": 3}
798 ref3 = registry.findDataset(datasetType, dataId3, collections=run1)
799 with self.assertRaises(ConflictingDefinitionError):
800 registry.associate(tag1, [ref3, ref2b])
801 self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
802 self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)
803 self.assertIsNone(registry.findDataset(datasetType, dataId3, collections=tag1))
804 # Register a chained collection that searches [tag1, run2]
805 chain1 = "chain1"
806 registry.registerCollection(chain1, type=CollectionType.CHAINED)
807 self.assertIs(registry.getCollectionType(chain1), CollectionType.CHAINED)
808 # Chained collection exists, but has no collections in it.
809 self.assertFalse(registry.getCollectionChain(chain1))
810 # If we query for all collections, we should get the chained collection
811 # only if we don't ask to flatten it (i.e. yield only its children).
812 self.assertEqual(set(registry.queryCollections(flattenChains=False)), {tag1, run1, run2, chain1})
813 self.assertEqual(set(registry.queryCollections(flattenChains=True)), {tag1, run1, run2})
814 # Attempt to set its child collections to something circular; that
815 # should fail.
816 with self.assertRaises(ValueError):
817 registry.setCollectionChain(chain1, [tag1, chain1])
818 # Add the child collections.
819 registry.setCollectionChain(chain1, [tag1, run2])
820 self.assertEqual(list(registry.getCollectionChain(chain1)), [tag1, run2])
821 self.assertEqual(registry.getCollectionParentChains(tag1), {chain1})
822 self.assertEqual(registry.getCollectionParentChains(run2), {chain1})
823 # Refresh the other registry that points to the same repo, and make
824 # sure it can see the things we've done (note that this does require
825 # an explicit refresh(); that's the documented behavior, because
826 # caching is ~impossible otherwise).
827 if other_registry is not None:
828 other_registry.refresh()
829 self.assertEqual(list(other_registry.getCollectionChain(chain1)), [tag1, run2])
830 self.assertEqual(other_registry.getCollectionParentChains(tag1), {chain1})
831 self.assertEqual(other_registry.getCollectionParentChains(run2), {chain1})
832 # Searching for dataId1 or dataId2 in the chain should return ref1 and
833 # ref2, because both are in tag1.
834 self.assertEqual(registry.findDataset(datasetType, dataId1, collections=chain1), ref1)
835 self.assertEqual(registry.findDataset(datasetType, dataId2, collections=chain1), ref2)
836 # Now disassociate ref2 from tag1. The search (for bias) with
837 # dataId2 in chain1 should then:
838 # 1. not find it in tag1
839 # 2. find a different dataset in run2
840 registry.disassociate(tag1, [ref2])
841 ref2b = registry.findDataset(datasetType, dataId2, collections=chain1)
842 self.assertNotEqual(ref2b, ref2)
843 self.assertEqual(ref2b, registry.findDataset(datasetType, dataId2, collections=run2))
844 # Define a new chain so we can test recursive chains.
845 chain2 = "chain2"
846 registry.registerCollection(chain2, type=CollectionType.CHAINED)
847 registry.setCollectionChain(chain2, [run2, chain1])
848 self.assertEqual(registry.getCollectionParentChains(chain1), {chain2})
849 self.assertEqual(registry.getCollectionParentChains(run2), {chain1, chain2})
850 # Query for collections matching a regex.
851 self.assertCountEqual(
852 list(registry.queryCollections(re.compile("imported_."), flattenChains=False)),
853 ["imported_r", "imported_g"],
854 )
855 # Query for collections matching a regex or an explicit str.
856 self.assertCountEqual(
857 list(registry.queryCollections([re.compile("imported_."), "chain1"], flattenChains=False)),
858 ["imported_r", "imported_g", "chain1"],
859 )
860 # Search for bias with dataId1 should find it via tag1 in chain2,
861 # recursing, because is not in run1.
862 self.assertIsNone(registry.findDataset(datasetType, dataId1, collections=run2))
863 self.assertEqual(registry.findDataset(datasetType, dataId1, collections=chain2), ref1)
864 # Search for bias with dataId2 should find it in run2 (ref2b).
865 self.assertEqual(registry.findDataset(datasetType, dataId2, collections=chain2), ref2b)
866 # Search for a flat that is in run2. That should not be found
867 # at the front of chain2, because of the restriction to bias
868 # on run2 there, but it should be found in at the end of chain1.
869 dataId4 = {"instrument": "Cam1", "detector": 3, "physical_filter": "Cam1-R2"}
870 ref4 = registry.findDataset("flat", dataId4, collections=run2)
871 self.assertIsNotNone(ref4)
872 self.assertEqual(ref4, registry.findDataset("flat", dataId4, collections=chain2))
873 # Deleting a collection that's part of a CHAINED collection is not
874 # allowed, and is exception-safe.
875 with self.assertRaises(Exception):
876 registry.removeCollection(run2)
877 self.assertEqual(registry.getCollectionType(run2), CollectionType.RUN)
878 with self.assertRaises(Exception):
879 registry.removeCollection(chain1)
880 self.assertEqual(registry.getCollectionType(chain1), CollectionType.CHAINED)
881 # Actually remove chain2, test that it's gone by asking for its type.
882 registry.removeCollection(chain2)
883 with self.assertRaises(MissingCollectionError):
884 registry.getCollectionType(chain2)
885 # Actually remove run2 and chain1, which should work now.
886 registry.removeCollection(chain1)
887 registry.removeCollection(run2)
888 with self.assertRaises(MissingCollectionError):
889 registry.getCollectionType(run2)
890 with self.assertRaises(MissingCollectionError):
891 registry.getCollectionType(chain1)
892 # Remove tag1 as well, just to test that we can remove TAGGED
893 # collections.
894 registry.removeCollection(tag1)
895 with self.assertRaises(MissingCollectionError):
896 registry.getCollectionType(tag1)
898 def testCollectionChainFlatten(self):
899 """Test that Registry.setCollectionChain obeys its 'flatten' option."""
900 registry = self.makeRegistry()
901 registry.registerCollection("inner", CollectionType.CHAINED)
902 registry.registerCollection("innermost", CollectionType.RUN)
903 registry.setCollectionChain("inner", ["innermost"])
904 registry.registerCollection("outer", CollectionType.CHAINED)
905 registry.setCollectionChain("outer", ["inner"], flatten=False)
906 self.assertEqual(list(registry.getCollectionChain("outer")), ["inner"])
907 registry.setCollectionChain("outer", ["inner"], flatten=True)
908 self.assertEqual(list(registry.getCollectionChain("outer")), ["innermost"])
910 def testBasicTransaction(self):
911 """Test that all operations within a single transaction block are
912 rolled back if an exception propagates out of the block.
913 """
914 registry = self.makeRegistry()
915 storageClass = StorageClass("testDatasetType")
916 registry.storageClasses.registerStorageClass(storageClass)
917 with registry.transaction():
918 registry.insertDimensionData("instrument", {"name": "Cam1", "class_name": "A"})
919 with self.assertRaises(ValueError):
920 with registry.transaction():
921 registry.insertDimensionData("instrument", {"name": "Cam2"})
922 raise ValueError("Oops, something went wrong")
923 # Cam1 should exist
924 self.assertEqual(registry.expandDataId(instrument="Cam1").records["instrument"].class_name, "A")
925 # But Cam2 and Cam3 should both not exist
926 with self.assertRaises(DataIdValueError):
927 registry.expandDataId(instrument="Cam2")
928 with self.assertRaises(DataIdValueError):
929 registry.expandDataId(instrument="Cam3")
931 def testNestedTransaction(self):
932 """Test that operations within a transaction block are not rolled back
933 if an exception propagates out of an inner transaction block and is
934 then caught.
935 """
936 registry = self.makeRegistry()
937 dimension = registry.dimensions["instrument"]
938 dataId1 = {"instrument": "DummyCam"}
939 dataId2 = {"instrument": "DummyCam2"}
940 checkpointReached = False
941 with registry.transaction():
942 # This should be added and (ultimately) committed.
943 registry.insertDimensionData(dimension, dataId1)
944 with self.assertRaises(sqlalchemy.exc.IntegrityError):
945 with registry.transaction(savepoint=True):
946 # This does not conflict, and should succeed (but not
947 # be committed).
948 registry.insertDimensionData(dimension, dataId2)
949 checkpointReached = True
950 # This should conflict and raise, triggerring a rollback
951 # of the previous insertion within the same transaction
952 # context, but not the original insertion in the outer
953 # block.
954 registry.insertDimensionData(dimension, dataId1)
955 self.assertTrue(checkpointReached)
956 self.assertIsNotNone(registry.expandDataId(dataId1, graph=dimension.graph))
957 with self.assertRaises(DataIdValueError):
958 registry.expandDataId(dataId2, graph=dimension.graph)
960 def testInstrumentDimensions(self):
961 """Test queries involving only instrument dimensions, with no joins to
962 skymap.
963 """
964 registry = self.makeRegistry()
966 # need a bunch of dimensions and datasets for test
967 registry.insertDimensionData(
968 "instrument", dict(name="DummyCam", visit_max=25, exposure_max=300, detector_max=6)
969 )
970 registry.insertDimensionData(
971 "physical_filter",
972 dict(instrument="DummyCam", name="dummy_r", band="r"),
973 dict(instrument="DummyCam", name="dummy_i", band="i"),
974 )
975 registry.insertDimensionData(
976 "detector", *[dict(instrument="DummyCam", id=i, full_name=str(i)) for i in range(1, 6)]
977 )
978 registry.insertDimensionData(
979 "visit_system",
980 dict(instrument="DummyCam", id=1, name="default"),
981 )
982 registry.insertDimensionData(
983 "visit",
984 dict(instrument="DummyCam", id=10, name="ten", physical_filter="dummy_i", visit_system=1),
985 dict(instrument="DummyCam", id=11, name="eleven", physical_filter="dummy_r", visit_system=1),
986 dict(instrument="DummyCam", id=20, name="twelve", physical_filter="dummy_r", visit_system=1),
987 )
988 for i in range(1, 6):
989 registry.insertDimensionData(
990 "visit_detector_region",
991 dict(instrument="DummyCam", visit=10, detector=i),
992 dict(instrument="DummyCam", visit=11, detector=i),
993 dict(instrument="DummyCam", visit=20, detector=i),
994 )
995 registry.insertDimensionData(
996 "exposure",
997 dict(instrument="DummyCam", id=100, obs_id="100", physical_filter="dummy_i"),
998 dict(instrument="DummyCam", id=101, obs_id="101", physical_filter="dummy_i"),
999 dict(instrument="DummyCam", id=110, obs_id="110", physical_filter="dummy_r"),
1000 dict(instrument="DummyCam", id=111, obs_id="111", physical_filter="dummy_r"),
1001 dict(instrument="DummyCam", id=200, obs_id="200", physical_filter="dummy_r"),
1002 dict(instrument="DummyCam", id=201, obs_id="201", physical_filter="dummy_r"),
1003 )
1004 registry.insertDimensionData(
1005 "visit_definition",
1006 dict(instrument="DummyCam", exposure=100, visit_system=1, visit=10),
1007 dict(instrument="DummyCam", exposure=101, visit_system=1, visit=10),
1008 dict(instrument="DummyCam", exposure=110, visit_system=1, visit=11),
1009 dict(instrument="DummyCam", exposure=111, visit_system=1, visit=11),
1010 dict(instrument="DummyCam", exposure=200, visit_system=1, visit=20),
1011 dict(instrument="DummyCam", exposure=201, visit_system=1, visit=20),
1012 )
1013 # dataset types
1014 run1 = "test1_r"
1015 run2 = "test2_r"
1016 tagged2 = "test2_t"
1017 registry.registerRun(run1)
1018 registry.registerRun(run2)
1019 registry.registerCollection(tagged2)
1020 storageClass = StorageClass("testDataset")
1021 registry.storageClasses.registerStorageClass(storageClass)
1022 rawType = DatasetType(
1023 name="RAW",
1024 dimensions=registry.dimensions.extract(("instrument", "exposure", "detector")),
1025 storageClass=storageClass,
1026 )
1027 registry.registerDatasetType(rawType)
1028 calexpType = DatasetType(
1029 name="CALEXP",
1030 dimensions=registry.dimensions.extract(("instrument", "visit", "detector")),
1031 storageClass=storageClass,
1032 )
1033 registry.registerDatasetType(calexpType)
1035 # add pre-existing datasets
1036 for exposure in (100, 101, 110, 111):
1037 for detector in (1, 2, 3):
1038 # note that only 3 of 5 detectors have datasets
1039 dataId = dict(instrument="DummyCam", exposure=exposure, detector=detector)
1040 (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run1)
1041 # exposures 100 and 101 appear in both run1 and tagged2.
1042 # 100 has different datasets in the different collections
1043 # 101 has the same dataset in both collections.
1044 if exposure == 100:
1045 (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run2)
1046 if exposure in (100, 101):
1047 registry.associate(tagged2, [ref])
1048 # Add pre-existing datasets to tagged2.
1049 for exposure in (200, 201):
1050 for detector in (3, 4, 5):
1051 # note that only 3 of 5 detectors have datasets
1052 dataId = dict(instrument="DummyCam", exposure=exposure, detector=detector)
1053 (ref,) = registry.insertDatasets(rawType, dataIds=[dataId], run=run2)
1054 registry.associate(tagged2, [ref])
1056 dimensions = DimensionGraph(
1057 registry.dimensions, dimensions=(rawType.dimensions.required | calexpType.dimensions.required)
1058 )
1059 # Test that single dim string works as well as list of str
1060 rows = registry.queryDataIds("visit", datasets=rawType, collections=run1).expanded().toSet()
1061 rowsI = registry.queryDataIds(["visit"], datasets=rawType, collections=run1).expanded().toSet()
1062 self.assertEqual(rows, rowsI)
1063 # with empty expression
1064 rows = registry.queryDataIds(dimensions, datasets=rawType, collections=run1).expanded().toSet()
1065 self.assertEqual(len(rows), 4 * 3) # 4 exposures times 3 detectors
1066 self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101, 110, 111))
1067 self.assertCountEqual({dataId["visit"] for dataId in rows}, (10, 11))
1068 self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3))
1070 # second collection
1071 rows = registry.queryDataIds(dimensions, datasets=rawType, collections=tagged2).toSet()
1072 self.assertEqual(len(rows), 4 * 3) # 4 exposures times 3 detectors
1073 for dataId in rows:
1074 self.assertCountEqual(dataId.keys(), ("instrument", "detector", "exposure", "visit"))
1075 self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101, 200, 201))
1076 self.assertCountEqual({dataId["visit"] for dataId in rows}, (10, 20))
1077 self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3, 4, 5))
1079 # with two input datasets
1080 rows = registry.queryDataIds(dimensions, datasets=rawType, collections=[run1, tagged2]).toSet()
1081 self.assertEqual(len(set(rows)), 6 * 3) # 6 exposures times 3 detectors; set needed to de-dupe
1082 for dataId in rows:
1083 self.assertCountEqual(dataId.keys(), ("instrument", "detector", "exposure", "visit"))
1084 self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101, 110, 111, 200, 201))
1085 self.assertCountEqual({dataId["visit"] for dataId in rows}, (10, 11, 20))
1086 self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3, 4, 5))
1088 # limit to single visit
1089 rows = registry.queryDataIds(
1090 dimensions, datasets=rawType, collections=run1, where="visit = 10", instrument="DummyCam"
1091 ).toSet()
1092 self.assertEqual(len(rows), 2 * 3) # 2 exposures times 3 detectors
1093 self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101))
1094 self.assertCountEqual({dataId["visit"] for dataId in rows}, (10,))
1095 self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3))
1097 # more limiting expression, using link names instead of Table.column
1098 rows = registry.queryDataIds(
1099 dimensions,
1100 datasets=rawType,
1101 collections=run1,
1102 where="visit = 10 and detector > 1 and 'DummyCam'=instrument",
1103 ).toSet()
1104 self.assertEqual(len(rows), 2 * 2) # 2 exposures times 2 detectors
1105 self.assertCountEqual({dataId["exposure"] for dataId in rows}, (100, 101))
1106 self.assertCountEqual({dataId["visit"] for dataId in rows}, (10,))
1107 self.assertCountEqual({dataId["detector"] for dataId in rows}, (2, 3))
1109 # queryDataIds with only one of `datasets` and `collections` is an
1110 # error.
1111 with self.assertRaises(CollectionError):
1112 registry.queryDataIds(dimensions, datasets=rawType)
1113 with self.assertRaises(ArgumentError):
1114 registry.queryDataIds(dimensions, collections=run1)
1116 # expression excludes everything
1117 rows = registry.queryDataIds(
1118 dimensions, datasets=rawType, collections=run1, where="visit > 1000", instrument="DummyCam"
1119 ).toSet()
1120 self.assertEqual(len(rows), 0)
1122 # Selecting by physical_filter, this is not in the dimensions, but it
1123 # is a part of the full expression so it should work too.
1124 rows = registry.queryDataIds(
1125 dimensions,
1126 datasets=rawType,
1127 collections=run1,
1128 where="physical_filter = 'dummy_r'",
1129 instrument="DummyCam",
1130 ).toSet()
1131 self.assertEqual(len(rows), 2 * 3) # 2 exposures times 3 detectors
1132 self.assertCountEqual({dataId["exposure"] for dataId in rows}, (110, 111))
1133 self.assertCountEqual({dataId["visit"] for dataId in rows}, (11,))
1134 self.assertCountEqual({dataId["detector"] for dataId in rows}, (1, 2, 3))
1136 def testSkyMapDimensions(self):
1137 """Tests involving only skymap dimensions, no joins to instrument."""
1138 registry = self.makeRegistry()
1140 # need a bunch of dimensions and datasets for test, we want
1141 # "band" in the test so also have to add physical_filter
1142 # dimensions
1143 registry.insertDimensionData("instrument", dict(instrument="DummyCam"))
1144 registry.insertDimensionData(
1145 "physical_filter",
1146 dict(instrument="DummyCam", name="dummy_r", band="r"),
1147 dict(instrument="DummyCam", name="dummy_i", band="i"),
1148 )
1149 registry.insertDimensionData("skymap", dict(name="DummyMap", hash=b"sha!"))
1150 for tract in range(10):
1151 registry.insertDimensionData("tract", dict(skymap="DummyMap", id=tract))
1152 registry.insertDimensionData(
1153 "patch",
1154 *[dict(skymap="DummyMap", tract=tract, id=patch, cell_x=0, cell_y=0) for patch in range(10)],
1155 )
1157 # dataset types
1158 run = "tésτ"
1159 registry.registerRun(run)
1160 storageClass = StorageClass("testDataset")
1161 registry.storageClasses.registerStorageClass(storageClass)
1162 calexpType = DatasetType(
1163 name="deepCoadd_calexp",
1164 dimensions=registry.dimensions.extract(("skymap", "tract", "patch", "band")),
1165 storageClass=storageClass,
1166 )
1167 registry.registerDatasetType(calexpType)
1168 mergeType = DatasetType(
1169 name="deepCoadd_mergeDet",
1170 dimensions=registry.dimensions.extract(("skymap", "tract", "patch")),
1171 storageClass=storageClass,
1172 )
1173 registry.registerDatasetType(mergeType)
1174 measType = DatasetType(
1175 name="deepCoadd_meas",
1176 dimensions=registry.dimensions.extract(("skymap", "tract", "patch", "band")),
1177 storageClass=storageClass,
1178 )
1179 registry.registerDatasetType(measType)
1181 dimensions = DimensionGraph(
1182 registry.dimensions,
1183 dimensions=(
1184 calexpType.dimensions.required | mergeType.dimensions.required | measType.dimensions.required
1185 ),
1186 )
1188 # add pre-existing datasets
1189 for tract in (1, 3, 5):
1190 for patch in (2, 4, 6, 7):
1191 dataId = dict(skymap="DummyMap", tract=tract, patch=patch)
1192 registry.insertDatasets(mergeType, dataIds=[dataId], run=run)
1193 for aFilter in ("i", "r"):
1194 dataId = dict(skymap="DummyMap", tract=tract, patch=patch, band=aFilter)
1195 registry.insertDatasets(calexpType, dataIds=[dataId], run=run)
1197 # with empty expression
1198 rows = registry.queryDataIds(dimensions, datasets=[calexpType, mergeType], collections=run).toSet()
1199 self.assertEqual(len(rows), 3 * 4 * 2) # 4 tracts x 4 patches x 2 filters
1200 for dataId in rows:
1201 self.assertCountEqual(dataId.keys(), ("skymap", "tract", "patch", "band"))
1202 self.assertCountEqual({dataId["tract"] for dataId in rows}, (1, 3, 5))
1203 self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 4, 6, 7))
1204 self.assertCountEqual({dataId["band"] for dataId in rows}, ("i", "r"))
1206 # limit to 2 tracts and 2 patches
1207 rows = registry.queryDataIds(
1208 dimensions,
1209 datasets=[calexpType, mergeType],
1210 collections=run,
1211 where="tract IN (1, 5) AND patch IN (2, 7)",
1212 skymap="DummyMap",
1213 ).toSet()
1214 self.assertEqual(len(rows), 2 * 2 * 2) # 2 tracts x 2 patches x 2 filters
1215 self.assertCountEqual({dataId["tract"] for dataId in rows}, (1, 5))
1216 self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 7))
1217 self.assertCountEqual({dataId["band"] for dataId in rows}, ("i", "r"))
1219 # limit to single filter
1220 rows = registry.queryDataIds(
1221 dimensions, datasets=[calexpType, mergeType], collections=run, where="band = 'i'"
1222 ).toSet()
1223 self.assertEqual(len(rows), 3 * 4 * 1) # 4 tracts x 4 patches x 2 filters
1224 self.assertCountEqual({dataId["tract"] for dataId in rows}, (1, 3, 5))
1225 self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 4, 6, 7))
1226 self.assertCountEqual({dataId["band"] for dataId in rows}, ("i",))
1228 # Specifying non-existing skymap is an exception
1229 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
1230 rows = registry.queryDataIds(
1231 dimensions, datasets=[calexpType, mergeType], collections=run, where="skymap = 'Mars'"
1232 ).toSet()
1234 def testSpatialJoin(self):
1235 """Test queries that involve spatial overlap joins."""
1236 registry = self.makeRegistry()
1237 self.loadData(registry, "hsc-rc2-subset.yaml")
1239 # Dictionary of spatial DatabaseDimensionElements, keyed by the name of
1240 # the TopologicalFamily they belong to. We'll relate all elements in
1241 # each family to all of the elements in each other family.
1242 families = defaultdict(set)
1243 # Dictionary of {element.name: {dataId: region}}.
1244 regions = {}
1245 for element in registry.dimensions.getDatabaseElements():
1246 if element.spatial is not None:
1247 families[element.spatial.name].add(element)
1248 regions[element.name] = {
1249 record.dataId: record.region for record in registry.queryDimensionRecords(element)
1250 }
1252 # If this check fails, it's not necessarily a problem - it may just be
1253 # a reasonable change to the default dimension definitions - but the
1254 # test below depends on there being more than one family to do anything
1255 # useful.
1256 self.assertEqual(len(families), 2)
1258 # Overlap DatabaseDimensionElements with each other.
1259 for family1, family2 in itertools.combinations(families, 2):
1260 for element1, element2 in itertools.product(families[family1], families[family2]):
1261 graph = DimensionGraph.union(element1.graph, element2.graph)
1262 # Construct expected set of overlapping data IDs via a
1263 # brute-force comparison of the regions we've already fetched.
1264 expected = {
1265 DataCoordinate.standardize({**dataId1.byName(), **dataId2.byName()}, graph=graph)
1266 for (dataId1, region1), (dataId2, region2) in itertools.product(
1267 regions[element1.name].items(), regions[element2.name].items()
1268 )
1269 if not region1.isDisjointFrom(region2)
1270 }
1271 self.assertGreater(len(expected), 2, msg="Test that we aren't just comparing empty sets.")
1272 queried = set(registry.queryDataIds(graph))
1273 self.assertEqual(expected, queried)
1275 # Overlap each DatabaseDimensionElement with the commonSkyPix system.
1276 commonSkyPix = registry.dimensions.commonSkyPix
1277 for elementName, regions in regions.items():
1278 graph = DimensionGraph.union(registry.dimensions[elementName].graph, commonSkyPix.graph)
1279 expected = set()
1280 for dataId, region in regions.items():
1281 for begin, end in commonSkyPix.pixelization.envelope(region):
1282 expected.update(
1283 DataCoordinate.standardize({commonSkyPix.name: index, **dataId.byName()}, graph=graph)
1284 for index in range(begin, end)
1285 )
1286 self.assertGreater(len(expected), 2, msg="Test that we aren't just comparing empty sets.")
1287 queried = set(registry.queryDataIds(graph))
1288 self.assertEqual(expected, queried)
1290 def testAbstractQuery(self):
1291 """Test that we can run a query that just lists the known
1292 bands. This is tricky because band is
1293 backed by a query against physical_filter.
1294 """
1295 registry = self.makeRegistry()
1296 registry.insertDimensionData("instrument", dict(name="DummyCam"))
1297 registry.insertDimensionData(
1298 "physical_filter",
1299 dict(instrument="DummyCam", name="dummy_i", band="i"),
1300 dict(instrument="DummyCam", name="dummy_i2", band="i"),
1301 dict(instrument="DummyCam", name="dummy_r", band="r"),
1302 )
1303 rows = registry.queryDataIds(["band"]).toSet()
1304 self.assertCountEqual(
1305 rows,
1306 [
1307 DataCoordinate.standardize(band="i", universe=registry.dimensions),
1308 DataCoordinate.standardize(band="r", universe=registry.dimensions),
1309 ],
1310 )
1312 def testAttributeManager(self):
1313 """Test basic functionality of attribute manager."""
1314 # number of attributes with schema versions in a fresh database,
1315 # 6 managers with 2 records per manager, plus config for dimensions
1316 VERSION_COUNT = 6 * 2 + 1
1318 registry = self.makeRegistry()
1319 attributes = registry._managers.attributes
1321 # check what get() returns for non-existing key
1322 self.assertIsNone(attributes.get("attr"))
1323 self.assertEqual(attributes.get("attr", ""), "")
1324 self.assertEqual(attributes.get("attr", "Value"), "Value")
1325 self.assertEqual(len(list(attributes.items())), VERSION_COUNT)
1327 # cannot store empty key or value
1328 with self.assertRaises(ValueError):
1329 attributes.set("", "value")
1330 with self.assertRaises(ValueError):
1331 attributes.set("attr", "")
1333 # set value of non-existing key
1334 attributes.set("attr", "value")
1335 self.assertEqual(len(list(attributes.items())), VERSION_COUNT + 1)
1336 self.assertEqual(attributes.get("attr"), "value")
1338 # update value of existing key
1339 with self.assertRaises(ButlerAttributeExistsError):
1340 attributes.set("attr", "value2")
1342 attributes.set("attr", "value2", force=True)
1343 self.assertEqual(len(list(attributes.items())), VERSION_COUNT + 1)
1344 self.assertEqual(attributes.get("attr"), "value2")
1346 # delete existing key
1347 self.assertTrue(attributes.delete("attr"))
1348 self.assertEqual(len(list(attributes.items())), VERSION_COUNT)
1350 # delete non-existing key
1351 self.assertFalse(attributes.delete("non-attr"))
1353 # store bunch of keys and get the list back
1354 data = [
1355 ("version.core", "1.2.3"),
1356 ("version.dimensions", "3.2.1"),
1357 ("config.managers.opaque", "ByNameOpaqueTableStorageManager"),
1358 ]
1359 for key, value in data:
1360 attributes.set(key, value)
1361 items = dict(attributes.items())
1362 for key, value in data:
1363 self.assertEqual(items[key], value)
1365 def testQueryDatasetsDeduplication(self):
1366 """Test that the findFirst option to queryDatasets selects datasets
1367 from collections in the order given".
1368 """
1369 registry = self.makeRegistry()
1370 self.loadData(registry, "base.yaml")
1371 self.loadData(registry, "datasets.yaml")
1372 self.assertCountEqual(
1373 list(registry.queryDatasets("bias", collections=["imported_g", "imported_r"])),
1374 [
1375 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1376 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g"),
1377 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g"),
1378 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r"),
1379 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r"),
1380 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1381 ],
1382 )
1383 self.assertCountEqual(
1384 list(registry.queryDatasets("bias", collections=["imported_g", "imported_r"], findFirst=True)),
1385 [
1386 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1387 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g"),
1388 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g"),
1389 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1390 ],
1391 )
1392 self.assertCountEqual(
1393 list(registry.queryDatasets("bias", collections=["imported_r", "imported_g"], findFirst=True)),
1394 [
1395 registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
1396 registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r"),
1397 registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r"),
1398 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
1399 ],
1400 )
1402 def testQueryResults(self):
1403 """Test querying for data IDs and then manipulating the QueryResults
1404 object returned to perform other queries.
1405 """
1406 registry = self.makeRegistry()
1407 self.loadData(registry, "base.yaml")
1408 self.loadData(registry, "datasets.yaml")
1409 bias = registry.getDatasetType("bias")
1410 flat = registry.getDatasetType("flat")
1411 # Obtain expected results from methods other than those we're testing
1412 # here. That includes:
1413 # - the dimensions of the data IDs we want to query:
1414 expectedGraph = DimensionGraph(registry.dimensions, names=["detector", "physical_filter"])
1415 # - the dimensions of some other data IDs we'll extract from that:
1416 expectedSubsetGraph = DimensionGraph(registry.dimensions, names=["detector"])
1417 # - the data IDs we expect to obtain from the first queries:
1418 expectedDataIds = DataCoordinateSet(
1419 {
1420 DataCoordinate.standardize(
1421 instrument="Cam1", detector=d, physical_filter=p, universe=registry.dimensions
1422 )
1423 for d, p in itertools.product({1, 2, 3}, {"Cam1-G", "Cam1-R1", "Cam1-R2"})
1424 },
1425 graph=expectedGraph,
1426 hasFull=False,
1427 hasRecords=False,
1428 )
1429 # - the flat datasets we expect to find from those data IDs, in just
1430 # one collection (so deduplication is irrelevant):
1431 expectedFlats = [
1432 registry.findDataset(
1433 flat, instrument="Cam1", detector=1, physical_filter="Cam1-R1", collections="imported_r"
1434 ),
1435 registry.findDataset(
1436 flat, instrument="Cam1", detector=2, physical_filter="Cam1-R1", collections="imported_r"
1437 ),
1438 registry.findDataset(
1439 flat, instrument="Cam1", detector=3, physical_filter="Cam1-R2", collections="imported_r"
1440 ),
1441 ]
1442 # - the data IDs we expect to extract from that:
1443 expectedSubsetDataIds = expectedDataIds.subset(expectedSubsetGraph)
1444 # - the bias datasets we expect to find from those data IDs, after we
1445 # subset-out the physical_filter dimension, both with duplicates:
1446 expectedAllBiases = [
1447 registry.findDataset(bias, instrument="Cam1", detector=1, collections="imported_g"),
1448 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_g"),
1449 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_g"),
1450 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_r"),
1451 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_r"),
1452 ]
1453 # - ...and without duplicates:
1454 expectedDeduplicatedBiases = [
1455 registry.findDataset(bias, instrument="Cam1", detector=1, collections="imported_g"),
1456 registry.findDataset(bias, instrument="Cam1", detector=2, collections="imported_r"),
1457 registry.findDataset(bias, instrument="Cam1", detector=3, collections="imported_r"),
1458 ]
1459 # Test against those expected results, using a "lazy" query for the
1460 # data IDs (which re-executes that query each time we use it to do
1461 # something new).
1462 dataIds = registry.queryDataIds(
1463 ["detector", "physical_filter"],
1464 where="detector.purpose = 'SCIENCE'", # this rejects detector=4
1465 instrument="Cam1",
1466 )
1467 self.assertEqual(dataIds.graph, expectedGraph)
1468 self.assertEqual(dataIds.toSet(), expectedDataIds)
1469 self.assertCountEqual(
1470 list(
1471 dataIds.findDatasets(
1472 flat,
1473 collections=["imported_r"],
1474 )
1475 ),
1476 expectedFlats,
1477 )
1478 subsetDataIds = dataIds.subset(expectedSubsetGraph, unique=True)
1479 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1480 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1481 self.assertCountEqual(
1482 list(subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=False)),
1483 expectedAllBiases,
1484 )
1485 self.assertCountEqual(
1486 list(subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)),
1487 expectedDeduplicatedBiases,
1488 )
1490 # Check dimensions match.
1491 with self.assertRaises(ValueError):
1492 subsetDataIds.findDatasets("flat", collections=["imported_r", "imported_g"], findFirst=True)
1494 # Use a component dataset type.
1495 self.assertCountEqual(
1496 [
1497 ref.makeComponentRef("image")
1498 for ref in subsetDataIds.findDatasets(
1499 bias,
1500 collections=["imported_r", "imported_g"],
1501 findFirst=False,
1502 )
1503 ],
1504 [ref.makeComponentRef("image") for ref in expectedAllBiases],
1505 )
1507 # Use a named dataset type that does not exist and a dataset type
1508 # object that does not exist.
1509 unknown_type = DatasetType("not_known", dimensions=bias.dimensions, storageClass="Exposure")
1511 # Test both string name and dataset type object.
1512 test_type: str | DatasetType
1513 for test_type, test_type_name in (
1514 (unknown_type, unknown_type.name),
1515 (unknown_type.name, unknown_type.name),
1516 ):
1517 with self.assertRaisesRegex(DatasetTypeError, expected_regex=test_type_name):
1518 list(
1519 subsetDataIds.findDatasets(
1520 test_type, collections=["imported_r", "imported_g"], findFirst=True
1521 )
1522 )
1524 # Materialize the bias dataset queries (only) by putting the results
1525 # into temporary tables, then repeat those tests.
1526 with subsetDataIds.findDatasets(
1527 bias, collections=["imported_r", "imported_g"], findFirst=False
1528 ).materialize() as biases:
1529 self.assertCountEqual(list(biases), expectedAllBiases)
1530 with subsetDataIds.findDatasets(
1531 bias, collections=["imported_r", "imported_g"], findFirst=True
1532 ).materialize() as biases:
1533 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1534 # Materialize the data ID subset query, but not the dataset queries.
1535 with subsetDataIds.materialize() as subsetDataIds:
1536 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1537 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1538 self.assertCountEqual(
1539 list(
1540 subsetDataIds.findDatasets(
1541 bias, collections=["imported_r", "imported_g"], findFirst=False
1542 )
1543 ),
1544 expectedAllBiases,
1545 )
1546 self.assertCountEqual(
1547 list(
1548 subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)
1549 ),
1550 expectedDeduplicatedBiases,
1551 )
1552 # Materialize the dataset queries, too.
1553 with subsetDataIds.findDatasets(
1554 bias, collections=["imported_r", "imported_g"], findFirst=False
1555 ).materialize() as biases:
1556 self.assertCountEqual(list(biases), expectedAllBiases)
1557 with subsetDataIds.findDatasets(
1558 bias, collections=["imported_r", "imported_g"], findFirst=True
1559 ).materialize() as biases:
1560 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1561 # Materialize the original query, but none of the follow-up queries.
1562 with dataIds.materialize() as dataIds:
1563 self.assertEqual(dataIds.graph, expectedGraph)
1564 self.assertEqual(dataIds.toSet(), expectedDataIds)
1565 self.assertCountEqual(
1566 list(
1567 dataIds.findDatasets(
1568 flat,
1569 collections=["imported_r"],
1570 )
1571 ),
1572 expectedFlats,
1573 )
1574 subsetDataIds = dataIds.subset(expectedSubsetGraph, unique=True)
1575 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1576 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1577 self.assertCountEqual(
1578 list(
1579 subsetDataIds.findDatasets(
1580 bias, collections=["imported_r", "imported_g"], findFirst=False
1581 )
1582 ),
1583 expectedAllBiases,
1584 )
1585 self.assertCountEqual(
1586 list(
1587 subsetDataIds.findDatasets(bias, collections=["imported_r", "imported_g"], findFirst=True)
1588 ),
1589 expectedDeduplicatedBiases,
1590 )
1591 # Materialize just the bias dataset queries.
1592 with subsetDataIds.findDatasets(
1593 bias, collections=["imported_r", "imported_g"], findFirst=False
1594 ).materialize() as biases:
1595 self.assertCountEqual(list(biases), expectedAllBiases)
1596 with subsetDataIds.findDatasets(
1597 bias, collections=["imported_r", "imported_g"], findFirst=True
1598 ).materialize() as biases:
1599 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1600 # Materialize the subset data ID query, but not the dataset
1601 # queries.
1602 with subsetDataIds.materialize() as subsetDataIds:
1603 self.assertEqual(subsetDataIds.graph, expectedSubsetGraph)
1604 self.assertEqual(subsetDataIds.toSet(), expectedSubsetDataIds)
1605 self.assertCountEqual(
1606 list(
1607 subsetDataIds.findDatasets(
1608 bias, collections=["imported_r", "imported_g"], findFirst=False
1609 )
1610 ),
1611 expectedAllBiases,
1612 )
1613 self.assertCountEqual(
1614 list(
1615 subsetDataIds.findDatasets(
1616 bias, collections=["imported_r", "imported_g"], findFirst=True
1617 )
1618 ),
1619 expectedDeduplicatedBiases,
1620 )
1621 # Materialize the bias dataset queries, too, so now we're
1622 # materializing every single step.
1623 with subsetDataIds.findDatasets(
1624 bias, collections=["imported_r", "imported_g"], findFirst=False
1625 ).materialize() as biases:
1626 self.assertCountEqual(list(biases), expectedAllBiases)
1627 with subsetDataIds.findDatasets(
1628 bias, collections=["imported_r", "imported_g"], findFirst=True
1629 ).materialize() as biases:
1630 self.assertCountEqual(list(biases), expectedDeduplicatedBiases)
1632 def testStorageClassPropagation(self):
1633 """Test that queries for datasets respect the storage class passed in
1634 as part of a full dataset type.
1635 """
1636 registry = self.makeRegistry()
1637 self.loadData(registry, "base.yaml")
1638 dataset_type_in_registry = DatasetType(
1639 "tbl", dimensions=["instrument"], storageClass="DataFrame", universe=registry.dimensions
1640 )
1641 registry.registerDatasetType(dataset_type_in_registry)
1642 run = "run1"
1643 registry.registerRun(run)
1644 (inserted_ref,) = registry.insertDatasets(
1645 dataset_type_in_registry, [registry.expandDataId(instrument="Cam1")], run=run
1646 )
1647 self.assertEqual(inserted_ref.datasetType, dataset_type_in_registry)
1648 query_dataset_type = DatasetType(
1649 "tbl", dimensions=["instrument"], storageClass="ArrowAstropy", universe=registry.dimensions
1650 )
1651 self.assertNotEqual(dataset_type_in_registry, query_dataset_type)
1652 query_datasets_result = registry.queryDatasets(query_dataset_type, collections=[run])
1653 self.assertEqual(query_datasets_result.parentDatasetType, query_dataset_type) # type: ignore
1654 (query_datasets_ref,) = query_datasets_result
1655 self.assertEqual(query_datasets_ref.datasetType, query_dataset_type)
1656 query_data_ids_find_datasets_result = registry.queryDataIds(["instrument"]).findDatasets(
1657 query_dataset_type, collections=[run]
1658 )
1659 self.assertEqual(query_data_ids_find_datasets_result.parentDatasetType, query_dataset_type)
1660 (query_data_ids_find_datasets_ref,) = query_data_ids_find_datasets_result
1661 self.assertEqual(query_data_ids_find_datasets_ref.datasetType, query_dataset_type)
1662 query_dataset_types_result = registry.queryDatasetTypes(query_dataset_type)
1663 self.assertEqual(list(query_dataset_types_result), [query_dataset_type])
1664 find_dataset_ref = registry.findDataset(query_dataset_type, instrument="Cam1", collections=[run])
1665 self.assertEqual(find_dataset_ref.datasetType, query_dataset_type)
1667 def testEmptyDimensionsQueries(self):
1668 """Test Query and QueryResults objects in the case where there are no
1669 dimensions.
1670 """
1671 # Set up test data: one dataset type, two runs, one dataset in each.
1672 registry = self.makeRegistry()
1673 self.loadData(registry, "base.yaml")
1674 schema = DatasetType("schema", dimensions=registry.dimensions.empty, storageClass="Catalog")
1675 registry.registerDatasetType(schema)
1676 dataId = DataCoordinate.makeEmpty(registry.dimensions)
1677 run1 = "run1"
1678 run2 = "run2"
1679 registry.registerRun(run1)
1680 registry.registerRun(run2)
1681 (dataset1,) = registry.insertDatasets(schema, dataIds=[dataId], run=run1)
1682 (dataset2,) = registry.insertDatasets(schema, dataIds=[dataId], run=run2)
1683 # Query directly for both of the datasets, and each one, one at a time.
1684 self.checkQueryResults(
1685 registry.queryDatasets(schema, collections=[run1, run2], findFirst=False), [dataset1, dataset2]
1686 )
1687 self.checkQueryResults(
1688 registry.queryDatasets(schema, collections=[run1, run2], findFirst=True),
1689 [dataset1],
1690 )
1691 self.checkQueryResults(
1692 registry.queryDatasets(schema, collections=[run2, run1], findFirst=True),
1693 [dataset2],
1694 )
1695 # Query for data IDs with no dimensions.
1696 dataIds = registry.queryDataIds([])
1697 self.checkQueryResults(dataIds, [dataId])
1698 # Use queried data IDs to find the datasets.
1699 self.checkQueryResults(
1700 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1701 [dataset1, dataset2],
1702 )
1703 self.checkQueryResults(
1704 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1705 [dataset1],
1706 )
1707 self.checkQueryResults(
1708 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1709 [dataset2],
1710 )
1711 # Now materialize the data ID query results and repeat those tests.
1712 with dataIds.materialize() as dataIds:
1713 self.checkQueryResults(dataIds, [dataId])
1714 self.checkQueryResults(
1715 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1716 [dataset1],
1717 )
1718 self.checkQueryResults(
1719 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1720 [dataset2],
1721 )
1722 # Query for non-empty data IDs, then subset that to get the empty one.
1723 # Repeat the above tests starting from that.
1724 dataIds = registry.queryDataIds(["instrument"]).subset(registry.dimensions.empty, unique=True)
1725 self.checkQueryResults(dataIds, [dataId])
1726 self.checkQueryResults(
1727 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1728 [dataset1, dataset2],
1729 )
1730 self.checkQueryResults(
1731 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1732 [dataset1],
1733 )
1734 self.checkQueryResults(
1735 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1736 [dataset2],
1737 )
1738 with dataIds.materialize() as dataIds:
1739 self.checkQueryResults(dataIds, [dataId])
1740 self.checkQueryResults(
1741 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1742 [dataset1, dataset2],
1743 )
1744 self.checkQueryResults(
1745 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1746 [dataset1],
1747 )
1748 self.checkQueryResults(
1749 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1750 [dataset2],
1751 )
1752 # Query for non-empty data IDs, then materialize, then subset to get
1753 # the empty one. Repeat again.
1754 with registry.queryDataIds(["instrument"]).materialize() as nonEmptyDataIds:
1755 dataIds = nonEmptyDataIds.subset(registry.dimensions.empty, unique=True)
1756 self.checkQueryResults(dataIds, [dataId])
1757 self.checkQueryResults(
1758 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1759 [dataset1, dataset2],
1760 )
1761 self.checkQueryResults(
1762 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1763 [dataset1],
1764 )
1765 self.checkQueryResults(
1766 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1767 [dataset2],
1768 )
1769 with dataIds.materialize() as dataIds:
1770 self.checkQueryResults(dataIds, [dataId])
1771 self.checkQueryResults(
1772 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=False),
1773 [dataset1, dataset2],
1774 )
1775 self.checkQueryResults(
1776 dataIds.findDatasets(schema, collections=[run1, run2], findFirst=True),
1777 [dataset1],
1778 )
1779 self.checkQueryResults(
1780 dataIds.findDatasets(schema, collections=[run2, run1], findFirst=True),
1781 [dataset2],
1782 )
1783 # Query for non-empty data IDs with a constraint on an empty-data-ID
1784 # dataset that exists.
1785 dataIds = registry.queryDataIds(["instrument"], datasets="schema", collections=...)
1786 self.checkQueryResults(
1787 dataIds.subset(unique=True),
1788 [DataCoordinate.standardize(instrument="Cam1", universe=registry.dimensions)],
1789 )
1790 # Again query for non-empty data IDs with a constraint on empty-data-ID
1791 # datasets, but when the datasets don't exist. We delete the existing
1792 # dataset and query just that collection rather than creating a new
1793 # empty collection because this is a bit less likely for our build-time
1794 # logic to shortcut-out (via the collection summaries), and such a
1795 # shortcut would make this test a bit more trivial than we'd like.
1796 registry.removeDatasets([dataset2])
1797 dataIds = registry.queryDataIds(["instrument"], datasets="schema", collections=run2)
1798 self.checkQueryResults(dataIds, [])
1800 def testDimensionDataModifications(self):
1801 """Test that modifying dimension records via:
1802 syncDimensionData(..., update=True) and
1803 insertDimensionData(..., replace=True) works as expected, even in the
1804 presence of datasets using those dimensions and spatial overlap
1805 relationships.
1806 """
1808 def unpack_range_set(ranges: lsst.sphgeom.RangeSet) -> Iterator[int]:
1809 """Unpack a sphgeom.RangeSet into the integers it contains."""
1810 for begin, end in ranges:
1811 yield from range(begin, end)
1813 def range_set_hull(
1814 ranges: lsst.sphgeom.RangeSet,
1815 pixelization: lsst.sphgeom.HtmPixelization,
1816 ) -> lsst.sphgeom.ConvexPolygon:
1817 """Create a ConvexPolygon hull of the region defined by a set of
1818 HTM pixelization index ranges.
1819 """
1820 points = []
1821 for index in unpack_range_set(ranges):
1822 points.extend(pixelization.triangle(index).getVertices())
1823 return lsst.sphgeom.ConvexPolygon(points)
1825 # Use HTM to set up an initial parent region (one arbitrary trixel)
1826 # and four child regions (the trixels within the parent at the next
1827 # level. We'll use the parent as a tract/visit region and the children
1828 # as its patch/visit_detector regions.
1829 registry = self.makeRegistry()
1830 htm6 = registry.dimensions.skypix["htm"][6].pixelization
1831 commonSkyPix = registry.dimensions.commonSkyPix.pixelization
1832 index = 12288
1833 child_ranges_small = lsst.sphgeom.RangeSet(index).scaled(4)
1834 assert htm6.universe().contains(child_ranges_small)
1835 child_regions_small = [htm6.triangle(i) for i in unpack_range_set(child_ranges_small)]
1836 parent_region_small = lsst.sphgeom.ConvexPolygon(
1837 list(itertools.chain.from_iterable(c.getVertices() for c in child_regions_small))
1838 )
1839 assert all(parent_region_small.contains(c) for c in child_regions_small)
1840 # Make a larger version of each child region, defined to be the set of
1841 # htm6 trixels that overlap the original's bounding circle. Make a new
1842 # parent that's the convex hull of the new children.
1843 child_regions_large = [
1844 range_set_hull(htm6.envelope(c.getBoundingCircle()), htm6) for c in child_regions_small
1845 ]
1846 assert all(large.contains(small) for large, small in zip(child_regions_large, child_regions_small))
1847 parent_region_large = lsst.sphgeom.ConvexPolygon(
1848 list(itertools.chain.from_iterable(c.getVertices() for c in child_regions_large))
1849 )
1850 assert all(parent_region_large.contains(c) for c in child_regions_large)
1851 assert parent_region_large.contains(parent_region_small)
1852 assert not parent_region_small.contains(parent_region_large)
1853 assert not all(parent_region_small.contains(c) for c in child_regions_large)
1854 # Find some commonSkyPix indices that overlap the large regions but not
1855 # overlap the small regions. We use commonSkyPix here to make sure the
1856 # real tests later involve what's in the database, not just post-query
1857 # filtering of regions.
1858 child_difference_indices = []
1859 for large, small in zip(child_regions_large, child_regions_small):
1860 difference = list(unpack_range_set(commonSkyPix.envelope(large) - commonSkyPix.envelope(small)))
1861 assert difference, "if this is empty, we can't test anything useful with these regions"
1862 assert all(
1863 not commonSkyPix.triangle(d).isDisjointFrom(large)
1864 and commonSkyPix.triangle(d).isDisjointFrom(small)
1865 for d in difference
1866 )
1867 child_difference_indices.append(difference)
1868 parent_difference_indices = list(
1869 unpack_range_set(
1870 commonSkyPix.envelope(parent_region_large) - commonSkyPix.envelope(parent_region_small)
1871 )
1872 )
1873 assert parent_difference_indices, "if this is empty, we can't test anything useful with these regions"
1874 assert all(
1875 (
1876 not commonSkyPix.triangle(d).isDisjointFrom(parent_region_large)
1877 and commonSkyPix.triangle(d).isDisjointFrom(parent_region_small)
1878 )
1879 for d in parent_difference_indices
1880 )
1881 # Now that we've finally got those regions, we'll insert the large ones
1882 # as tract/patch dimension records.
1883 skymap_name = "testing_v1"
1884 registry.insertDimensionData(
1885 "skymap",
1886 {
1887 "name": skymap_name,
1888 "hash": bytes([42]),
1889 "tract_max": 1,
1890 "patch_nx_max": 2,
1891 "patch_ny_max": 2,
1892 },
1893 )
1894 registry.insertDimensionData("tract", {"skymap": skymap_name, "id": 0, "region": parent_region_large})
1895 registry.insertDimensionData(
1896 "patch",
1897 *[
1898 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1899 for n, c in enumerate(child_regions_large)
1900 ],
1901 )
1902 # Add at dataset that uses these dimensions to make sure that modifying
1903 # them doesn't disrupt foreign keys (need to make sure DB doesn't
1904 # implement insert with replace=True as delete-then-insert).
1905 dataset_type = DatasetType(
1906 "coadd",
1907 dimensions=["tract", "patch"],
1908 universe=registry.dimensions,
1909 storageClass="Exposure",
1910 )
1911 registry.registerDatasetType(dataset_type)
1912 registry.registerCollection("the_run", CollectionType.RUN)
1913 registry.insertDatasets(
1914 dataset_type,
1915 [{"skymap": skymap_name, "tract": 0, "patch": 2}],
1916 run="the_run",
1917 )
1918 # Query for tracts and patches that overlap some "difference" htm9
1919 # pixels; there should be overlaps, because the database has
1920 # the "large" suite of regions.
1921 self.assertEqual(
1922 {0},
1923 {
1924 data_id["tract"]
1925 for data_id in registry.queryDataIds(
1926 ["tract"],
1927 skymap=skymap_name,
1928 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
1929 )
1930 },
1931 )
1932 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
1933 self.assertIn(
1934 patch_id,
1935 {
1936 data_id["patch"]
1937 for data_id in registry.queryDataIds(
1938 ["patch"],
1939 skymap=skymap_name,
1940 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
1941 )
1942 },
1943 )
1944 # Use sync to update the tract region and insert to update the regions
1945 # of the patches, to the "small" suite.
1946 updated = registry.syncDimensionData(
1947 "tract",
1948 {"skymap": skymap_name, "id": 0, "region": parent_region_small},
1949 update=True,
1950 )
1951 self.assertEqual(updated, {"region": parent_region_large})
1952 registry.insertDimensionData(
1953 "patch",
1954 *[
1955 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1956 for n, c in enumerate(child_regions_small)
1957 ],
1958 replace=True,
1959 )
1960 # Query again; there now should be no such overlaps, because the
1961 # database has the "small" suite of regions.
1962 self.assertFalse(
1963 set(
1964 registry.queryDataIds(
1965 ["tract"],
1966 skymap=skymap_name,
1967 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
1968 )
1969 )
1970 )
1971 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
1972 self.assertNotIn(
1973 patch_id,
1974 {
1975 data_id["patch"]
1976 for data_id in registry.queryDataIds(
1977 ["patch"],
1978 skymap=skymap_name,
1979 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
1980 )
1981 },
1982 )
1983 # Update back to the large regions and query one more time.
1984 updated = registry.syncDimensionData(
1985 "tract",
1986 {"skymap": skymap_name, "id": 0, "region": parent_region_large},
1987 update=True,
1988 )
1989 self.assertEqual(updated, {"region": parent_region_small})
1990 registry.insertDimensionData(
1991 "patch",
1992 *[
1993 {"skymap": skymap_name, "tract": 0, "id": n, "cell_x": n % 2, "cell_y": n // 2, "region": c}
1994 for n, c in enumerate(child_regions_large)
1995 ],
1996 replace=True,
1997 )
1998 self.assertEqual(
1999 {0},
2000 {
2001 data_id["tract"]
2002 for data_id in registry.queryDataIds(
2003 ["tract"],
2004 skymap=skymap_name,
2005 dataId={registry.dimensions.commonSkyPix.name: parent_difference_indices[0]},
2006 )
2007 },
2008 )
2009 for patch_id, patch_difference_indices in enumerate(child_difference_indices):
2010 self.assertIn(
2011 patch_id,
2012 {
2013 data_id["patch"]
2014 for data_id in registry.queryDataIds(
2015 ["patch"],
2016 skymap=skymap_name,
2017 dataId={registry.dimensions.commonSkyPix.name: patch_difference_indices[0]},
2018 )
2019 },
2020 )
2022 def testCalibrationCollections(self):
2023 """Test operations on `~CollectionType.CALIBRATION` collections,
2024 including `Registry.certify`, `Registry.decertify`, and
2025 `Registry.findDataset`.
2026 """
2027 # Setup - make a Registry, fill it with some datasets in
2028 # non-calibration collections.
2029 registry = self.makeRegistry()
2030 self.loadData(registry, "base.yaml")
2031 self.loadData(registry, "datasets.yaml")
2032 # Set up some timestamps.
2033 t1 = astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai")
2034 t2 = astropy.time.Time("2020-01-01T02:00:00", format="isot", scale="tai")
2035 t3 = astropy.time.Time("2020-01-01T03:00:00", format="isot", scale="tai")
2036 t4 = astropy.time.Time("2020-01-01T04:00:00", format="isot", scale="tai")
2037 t5 = astropy.time.Time("2020-01-01T05:00:00", format="isot", scale="tai")
2038 allTimespans = [
2039 Timespan(a, b) for a, b in itertools.combinations([None, t1, t2, t3, t4, t5, None], r=2)
2040 ]
2041 # Get references to some datasets.
2042 bias2a = registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_g")
2043 bias3a = registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_g")
2044 bias2b = registry.findDataset("bias", instrument="Cam1", detector=2, collections="imported_r")
2045 bias3b = registry.findDataset("bias", instrument="Cam1", detector=3, collections="imported_r")
2046 # Register the main calibration collection we'll be working with.
2047 collection = "Cam1/calibs/default"
2048 registry.registerCollection(collection, type=CollectionType.CALIBRATION)
2049 # Cannot associate into a calibration collection (no timespan).
2050 with self.assertRaises(CollectionTypeError):
2051 registry.associate(collection, [bias2a])
2052 # Certify 2a dataset with [t2, t4) validity.
2053 registry.certify(collection, [bias2a], Timespan(begin=t2, end=t4))
2054 # Test that we can query for this dataset via the new collection, both
2055 # on its own and with a RUN collection, as long as we don't try to join
2056 # in temporal dimensions or use findFirst=True.
2057 self.assertEqual(
2058 set(registry.queryDatasets("bias", findFirst=False, collections=collection)),
2059 {bias2a},
2060 )
2061 self.assertEqual(
2062 set(registry.queryDatasets("bias", findFirst=False, collections=[collection, "imported_r"])),
2063 {
2064 bias2a,
2065 bias2b,
2066 bias3b,
2067 registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
2068 },
2069 )
2070 self.assertEqual(
2071 set(registry.queryDataIds("detector", datasets="bias", collections=collection)),
2072 {registry.expandDataId(instrument="Cam1", detector=2)},
2073 )
2074 self.assertEqual(
2075 set(registry.queryDataIds("detector", datasets="bias", collections=[collection, "imported_r"])),
2076 {
2077 registry.expandDataId(instrument="Cam1", detector=2),
2078 registry.expandDataId(instrument="Cam1", detector=3),
2079 registry.expandDataId(instrument="Cam1", detector=4),
2080 },
2081 )
2083 # We should not be able to certify 2b with anything overlapping that
2084 # window.
2085 with self.assertRaises(ConflictingDefinitionError):
2086 registry.certify(collection, [bias2b], Timespan(begin=None, end=t3))
2087 with self.assertRaises(ConflictingDefinitionError):
2088 registry.certify(collection, [bias2b], Timespan(begin=None, end=t5))
2089 with self.assertRaises(ConflictingDefinitionError):
2090 registry.certify(collection, [bias2b], Timespan(begin=t1, end=t3))
2091 with self.assertRaises(ConflictingDefinitionError):
2092 registry.certify(collection, [bias2b], Timespan(begin=t1, end=t5))
2093 with self.assertRaises(ConflictingDefinitionError):
2094 registry.certify(collection, [bias2b], Timespan(begin=t1, end=None))
2095 with self.assertRaises(ConflictingDefinitionError):
2096 registry.certify(collection, [bias2b], Timespan(begin=t2, end=t3))
2097 with self.assertRaises(ConflictingDefinitionError):
2098 registry.certify(collection, [bias2b], Timespan(begin=t2, end=t5))
2099 with self.assertRaises(ConflictingDefinitionError):
2100 registry.certify(collection, [bias2b], Timespan(begin=t2, end=None))
2101 # We should be able to certify 3a with a range overlapping that window,
2102 # because it's for a different detector.
2103 # We'll certify 3a over [t1, t3).
2104 registry.certify(collection, [bias3a], Timespan(begin=t1, end=t3))
2105 # Now we'll certify 2b and 3b together over [t4, ∞).
2106 registry.certify(collection, [bias2b, bias3b], Timespan(begin=t4, end=None))
2108 # Fetch all associations and check that they are what we expect.
2109 self.assertCountEqual(
2110 list(
2111 registry.queryDatasetAssociations(
2112 "bias",
2113 collections=[collection, "imported_g", "imported_r"],
2114 )
2115 ),
2116 [
2117 DatasetAssociation(
2118 ref=registry.findDataset("bias", instrument="Cam1", detector=1, collections="imported_g"),
2119 collection="imported_g",
2120 timespan=None,
2121 ),
2122 DatasetAssociation(
2123 ref=registry.findDataset("bias", instrument="Cam1", detector=4, collections="imported_r"),
2124 collection="imported_r",
2125 timespan=None,
2126 ),
2127 DatasetAssociation(ref=bias2a, collection="imported_g", timespan=None),
2128 DatasetAssociation(ref=bias3a, collection="imported_g", timespan=None),
2129 DatasetAssociation(ref=bias2b, collection="imported_r", timespan=None),
2130 DatasetAssociation(ref=bias3b, collection="imported_r", timespan=None),
2131 DatasetAssociation(ref=bias2a, collection=collection, timespan=Timespan(begin=t2, end=t4)),
2132 DatasetAssociation(ref=bias3a, collection=collection, timespan=Timespan(begin=t1, end=t3)),
2133 DatasetAssociation(ref=bias2b, collection=collection, timespan=Timespan(begin=t4, end=None)),
2134 DatasetAssociation(ref=bias3b, collection=collection, timespan=Timespan(begin=t4, end=None)),
2135 ],
2136 )
2138 class Ambiguous:
2139 """Tag class to denote lookups that should be ambiguous."""
2141 pass
2143 def assertLookup(
2144 detector: int, timespan: Timespan, expected: DatasetRef | type[Ambiguous] | None
2145 ) -> None:
2146 """Local function that asserts that a bias lookup returns the given
2147 expected result.
2148 """
2149 if expected is Ambiguous:
2150 with self.assertRaises((DatasetTypeError, LookupError)):
2151 registry.findDataset(
2152 "bias",
2153 collections=collection,
2154 instrument="Cam1",
2155 detector=detector,
2156 timespan=timespan,
2157 )
2158 else:
2159 self.assertEqual(
2160 expected,
2161 registry.findDataset(
2162 "bias",
2163 collections=collection,
2164 instrument="Cam1",
2165 detector=detector,
2166 timespan=timespan,
2167 ),
2168 )
2170 # Systematically test lookups against expected results.
2171 assertLookup(detector=2, timespan=Timespan(None, t1), expected=None)
2172 assertLookup(detector=2, timespan=Timespan(None, t2), expected=None)
2173 assertLookup(detector=2, timespan=Timespan(None, t3), expected=bias2a)
2174 assertLookup(detector=2, timespan=Timespan(None, t4), expected=bias2a)
2175 assertLookup(detector=2, timespan=Timespan(None, t5), expected=Ambiguous)
2176 assertLookup(detector=2, timespan=Timespan(None, None), expected=Ambiguous)
2177 assertLookup(detector=2, timespan=Timespan(t1, t2), expected=None)
2178 assertLookup(detector=2, timespan=Timespan(t1, t3), expected=bias2a)
2179 assertLookup(detector=2, timespan=Timespan(t1, t4), expected=bias2a)
2180 assertLookup(detector=2, timespan=Timespan(t1, t5), expected=Ambiguous)
2181 assertLookup(detector=2, timespan=Timespan(t1, None), expected=Ambiguous)
2182 assertLookup(detector=2, timespan=Timespan(t2, t3), expected=bias2a)
2183 assertLookup(detector=2, timespan=Timespan(t2, t4), expected=bias2a)
2184 assertLookup(detector=2, timespan=Timespan(t2, t5), expected=Ambiguous)
2185 assertLookup(detector=2, timespan=Timespan(t2, None), expected=Ambiguous)
2186 assertLookup(detector=2, timespan=Timespan(t3, t4), expected=bias2a)
2187 assertLookup(detector=2, timespan=Timespan(t3, t5), expected=Ambiguous)
2188 assertLookup(detector=2, timespan=Timespan(t3, None), expected=Ambiguous)
2189 assertLookup(detector=2, timespan=Timespan(t4, t5), expected=bias2b)
2190 assertLookup(detector=2, timespan=Timespan(t4, None), expected=bias2b)
2191 assertLookup(detector=2, timespan=Timespan(t5, None), expected=bias2b)
2192 assertLookup(detector=3, timespan=Timespan(None, t1), expected=None)
2193 assertLookup(detector=3, timespan=Timespan(None, t2), expected=bias3a)
2194 assertLookup(detector=3, timespan=Timespan(None, t3), expected=bias3a)
2195 assertLookup(detector=3, timespan=Timespan(None, t4), expected=bias3a)
2196 assertLookup(detector=3, timespan=Timespan(None, t5), expected=Ambiguous)
2197 assertLookup(detector=3, timespan=Timespan(None, None), expected=Ambiguous)
2198 assertLookup(detector=3, timespan=Timespan(t1, t2), expected=bias3a)
2199 assertLookup(detector=3, timespan=Timespan(t1, t3), expected=bias3a)
2200 assertLookup(detector=3, timespan=Timespan(t1, t4), expected=bias3a)
2201 assertLookup(detector=3, timespan=Timespan(t1, t5), expected=Ambiguous)
2202 assertLookup(detector=3, timespan=Timespan(t1, None), expected=Ambiguous)
2203 assertLookup(detector=3, timespan=Timespan(t2, t3), expected=bias3a)
2204 assertLookup(detector=3, timespan=Timespan(t2, t4), expected=bias3a)
2205 assertLookup(detector=3, timespan=Timespan(t2, t5), expected=Ambiguous)
2206 assertLookup(detector=3, timespan=Timespan(t2, None), expected=Ambiguous)
2207 assertLookup(detector=3, timespan=Timespan(t3, t4), expected=None)
2208 assertLookup(detector=3, timespan=Timespan(t3, t5), expected=bias3b)
2209 assertLookup(detector=3, timespan=Timespan(t3, None), expected=bias3b)
2210 assertLookup(detector=3, timespan=Timespan(t4, t5), expected=bias3b)
2211 assertLookup(detector=3, timespan=Timespan(t4, None), expected=bias3b)
2212 assertLookup(detector=3, timespan=Timespan(t5, None), expected=bias3b)
2214 # Decertify [t3, t5) for all data IDs, and do test lookups again.
2215 # This should truncate bias2a to [t2, t3), leave bias3a unchanged at
2216 # [t1, t3), and truncate bias2b and bias3b to [t5, ∞).
2217 registry.decertify(collection=collection, datasetType="bias", timespan=Timespan(t3, t5))
2218 assertLookup(detector=2, timespan=Timespan(None, t1), expected=None)
2219 assertLookup(detector=2, timespan=Timespan(None, t2), expected=None)
2220 assertLookup(detector=2, timespan=Timespan(None, t3), expected=bias2a)
2221 assertLookup(detector=2, timespan=Timespan(None, t4), expected=bias2a)
2222 assertLookup(detector=2, timespan=Timespan(None, t5), expected=bias2a)
2223 assertLookup(detector=2, timespan=Timespan(None, None), expected=Ambiguous)
2224 assertLookup(detector=2, timespan=Timespan(t1, t2), expected=None)
2225 assertLookup(detector=2, timespan=Timespan(t1, t3), expected=bias2a)
2226 assertLookup(detector=2, timespan=Timespan(t1, t4), expected=bias2a)
2227 assertLookup(detector=2, timespan=Timespan(t1, t5), expected=bias2a)
2228 assertLookup(detector=2, timespan=Timespan(t1, None), expected=Ambiguous)
2229 assertLookup(detector=2, timespan=Timespan(t2, t3), expected=bias2a)
2230 assertLookup(detector=2, timespan=Timespan(t2, t4), expected=bias2a)
2231 assertLookup(detector=2, timespan=Timespan(t2, t5), expected=bias2a)
2232 assertLookup(detector=2, timespan=Timespan(t2, None), expected=Ambiguous)
2233 assertLookup(detector=2, timespan=Timespan(t3, t4), expected=None)
2234 assertLookup(detector=2, timespan=Timespan(t3, t5), expected=None)
2235 assertLookup(detector=2, timespan=Timespan(t3, None), expected=bias2b)
2236 assertLookup(detector=2, timespan=Timespan(t4, t5), expected=None)
2237 assertLookup(detector=2, timespan=Timespan(t4, None), expected=bias2b)
2238 assertLookup(detector=2, timespan=Timespan(t5, None), expected=bias2b)
2239 assertLookup(detector=3, timespan=Timespan(None, t1), expected=None)
2240 assertLookup(detector=3, timespan=Timespan(None, t2), expected=bias3a)
2241 assertLookup(detector=3, timespan=Timespan(None, t3), expected=bias3a)
2242 assertLookup(detector=3, timespan=Timespan(None, t4), expected=bias3a)
2243 assertLookup(detector=3, timespan=Timespan(None, t5), expected=bias3a)
2244 assertLookup(detector=3, timespan=Timespan(None, None), expected=Ambiguous)
2245 assertLookup(detector=3, timespan=Timespan(t1, t2), expected=bias3a)
2246 assertLookup(detector=3, timespan=Timespan(t1, t3), expected=bias3a)
2247 assertLookup(detector=3, timespan=Timespan(t1, t4), expected=bias3a)
2248 assertLookup(detector=3, timespan=Timespan(t1, t5), expected=bias3a)
2249 assertLookup(detector=3, timespan=Timespan(t1, None), expected=Ambiguous)
2250 assertLookup(detector=3, timespan=Timespan(t2, t3), expected=bias3a)
2251 assertLookup(detector=3, timespan=Timespan(t2, t4), expected=bias3a)
2252 assertLookup(detector=3, timespan=Timespan(t2, t5), expected=bias3a)
2253 assertLookup(detector=3, timespan=Timespan(t2, None), expected=Ambiguous)
2254 assertLookup(detector=3, timespan=Timespan(t3, t4), expected=None)
2255 assertLookup(detector=3, timespan=Timespan(t3, t5), expected=None)
2256 assertLookup(detector=3, timespan=Timespan(t3, None), expected=bias3b)
2257 assertLookup(detector=3, timespan=Timespan(t4, t5), expected=None)
2258 assertLookup(detector=3, timespan=Timespan(t4, None), expected=bias3b)
2259 assertLookup(detector=3, timespan=Timespan(t5, None), expected=bias3b)
2261 # Decertify everything, this time with explicit data IDs, then check
2262 # that no lookups succeed.
2263 registry.decertify(
2264 collection,
2265 "bias",
2266 Timespan(None, None),
2267 dataIds=[
2268 dict(instrument="Cam1", detector=2),
2269 dict(instrument="Cam1", detector=3),
2270 ],
2271 )
2272 for detector in (2, 3):
2273 for timespan in allTimespans:
2274 assertLookup(detector=detector, timespan=timespan, expected=None)
2275 # Certify bias2a and bias3a over (-∞, ∞), check that all lookups return
2276 # those.
2277 registry.certify(
2278 collection,
2279 [bias2a, bias3a],
2280 Timespan(None, None),
2281 )
2282 for timespan in allTimespans:
2283 assertLookup(detector=2, timespan=timespan, expected=bias2a)
2284 assertLookup(detector=3, timespan=timespan, expected=bias3a)
2285 # Decertify just bias2 over [t2, t4).
2286 # This should split a single certification row into two (and leave the
2287 # other existing row, for bias3a, alone).
2288 registry.decertify(
2289 collection, "bias", Timespan(t2, t4), dataIds=[dict(instrument="Cam1", detector=2)]
2290 )
2291 for timespan in allTimespans:
2292 assertLookup(detector=3, timespan=timespan, expected=bias3a)
2293 overlapsBefore = timespan.overlaps(Timespan(None, t2))
2294 overlapsAfter = timespan.overlaps(Timespan(t4, None))
2295 if overlapsBefore and overlapsAfter:
2296 expected = Ambiguous
2297 elif overlapsBefore or overlapsAfter:
2298 expected = bias2a
2299 else:
2300 expected = None
2301 assertLookup(detector=2, timespan=timespan, expected=expected)
2303 def testSkipCalibs(self):
2304 """Test how queries handle skipping of calibration collections."""
2305 registry = self.makeRegistry()
2306 self.loadData(registry, "base.yaml")
2307 self.loadData(registry, "datasets.yaml")
2309 coll_calib = "Cam1/calibs/default"
2310 registry.registerCollection(coll_calib, type=CollectionType.CALIBRATION)
2312 # Add all biases to the calibration collection.
2313 # Without this, the logic that prunes dataset subqueries based on
2314 # datasetType-collection summary information will fire before the logic
2315 # we want to test below. This is a good thing (it avoids the dreaded
2316 # NotImplementedError a bit more often) everywhere but here.
2317 registry.certify(coll_calib, registry.queryDatasets("bias", collections=...), Timespan(None, None))
2319 coll_list = [coll_calib, "imported_g", "imported_r"]
2320 chain = "Cam1/chain"
2321 registry.registerCollection(chain, type=CollectionType.CHAINED)
2322 registry.setCollectionChain(chain, coll_list)
2324 # explicit list will raise if findFirst=True or there are temporal
2325 # dimensions
2326 with self.assertRaises(NotImplementedError):
2327 registry.queryDatasets("bias", collections=coll_list, findFirst=True)
2328 with self.assertRaises(NotImplementedError):
2329 registry.queryDataIds(
2330 ["instrument", "detector", "exposure"], datasets="bias", collections=coll_list
2331 ).count()
2333 # chain will skip
2334 datasets = list(registry.queryDatasets("bias", collections=chain))
2335 self.assertGreater(len(datasets), 0)
2337 dataIds = list(registry.queryDataIds(["instrument", "detector"], datasets="bias", collections=chain))
2338 self.assertGreater(len(dataIds), 0)
2340 # glob will skip too
2341 datasets = list(registry.queryDatasets("bias", collections="*d*"))
2342 self.assertGreater(len(datasets), 0)
2344 # regular expression will skip too
2345 pattern = re.compile(".*")
2346 datasets = list(registry.queryDatasets("bias", collections=pattern))
2347 self.assertGreater(len(datasets), 0)
2349 # ellipsis should work as usual
2350 datasets = list(registry.queryDatasets("bias", collections=...))
2351 self.assertGreater(len(datasets), 0)
2353 # few tests with findFirst
2354 datasets = list(registry.queryDatasets("bias", collections=chain, findFirst=True))
2355 self.assertGreater(len(datasets), 0)
2357 def testIngestTimeQuery(self):
2358 registry = self.makeRegistry()
2359 self.loadData(registry, "base.yaml")
2360 dt0 = datetime.utcnow()
2361 self.loadData(registry, "datasets.yaml")
2362 dt1 = datetime.utcnow()
2364 datasets = list(registry.queryDatasets(..., collections=...))
2365 len0 = len(datasets)
2366 self.assertGreater(len0, 0)
2368 where = "ingest_date > T'2000-01-01'"
2369 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2370 len1 = len(datasets)
2371 self.assertEqual(len0, len1)
2373 # no one will ever use this piece of software in 30 years
2374 where = "ingest_date > T'2050-01-01'"
2375 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2376 len2 = len(datasets)
2377 self.assertEqual(len2, 0)
2379 # Check more exact timing to make sure there is no 37 seconds offset
2380 # (after fixing DM-30124). SQLite time precision is 1 second, make
2381 # sure that we don't test with higher precision.
2382 tests = [
2383 # format: (timestamp, operator, expected_len)
2384 (dt0 - timedelta(seconds=1), ">", len0),
2385 (dt0 - timedelta(seconds=1), "<", 0),
2386 (dt1 + timedelta(seconds=1), "<", len0),
2387 (dt1 + timedelta(seconds=1), ">", 0),
2388 ]
2389 for dt, op, expect_len in tests:
2390 dt_str = dt.isoformat(sep=" ")
2392 where = f"ingest_date {op} T'{dt_str}'"
2393 datasets = list(registry.queryDatasets(..., collections=..., where=where))
2394 self.assertEqual(len(datasets), expect_len)
2396 # same with bind using datetime or astropy Time
2397 where = f"ingest_date {op} ingest_time"
2398 datasets = list(
2399 registry.queryDatasets(..., collections=..., where=where, bind={"ingest_time": dt})
2400 )
2401 self.assertEqual(len(datasets), expect_len)
2403 dt_astropy = astropy.time.Time(dt, format="datetime")
2404 datasets = list(
2405 registry.queryDatasets(..., collections=..., where=where, bind={"ingest_time": dt_astropy})
2406 )
2407 self.assertEqual(len(datasets), expect_len)
2409 def testTimespanQueries(self):
2410 """Test query expressions involving timespans."""
2411 registry = self.makeRegistry()
2412 self.loadData(registry, "hsc-rc2-subset.yaml")
2413 # All exposures in the database; mapping from ID to timespan.
2414 visits = {record.id: record.timespan for record in registry.queryDimensionRecords("visit")}
2415 # Just those IDs, sorted (which is also temporal sorting, because HSC
2416 # exposure IDs are monotonically increasing).
2417 ids = sorted(visits.keys())
2418 self.assertGreater(len(ids), 20)
2419 # Pick some quasi-random indexes into `ids` to play with.
2420 i1 = int(len(ids) * 0.1)
2421 i2 = int(len(ids) * 0.3)
2422 i3 = int(len(ids) * 0.6)
2423 i4 = int(len(ids) * 0.8)
2424 # Extract some times from those: just before the beginning of i1 (which
2425 # should be after the end of the exposure before), exactly the
2426 # beginning of i2, just after the beginning of i3 (and before its end),
2427 # and the exact end of i4.
2428 t1 = visits[ids[i1]].begin - astropy.time.TimeDelta(1.0, format="sec")
2429 self.assertGreater(t1, visits[ids[i1 - 1]].end)
2430 t2 = visits[ids[i2]].begin
2431 t3 = visits[ids[i3]].begin + astropy.time.TimeDelta(1.0, format="sec")
2432 self.assertLess(t3, visits[ids[i3]].end)
2433 t4 = visits[ids[i4]].end
2434 # Make sure those are actually in order.
2435 self.assertEqual([t1, t2, t3, t4], sorted([t4, t3, t2, t1]))
2437 bind = {
2438 "t1": t1,
2439 "t2": t2,
2440 "t3": t3,
2441 "t4": t4,
2442 "ts23": Timespan(t2, t3),
2443 }
2445 def query(where):
2446 """Return results as a sorted, deduplicated list of visit IDs."""
2447 return sorted(
2448 {
2449 dataId["visit"]
2450 for dataId in registry.queryDataIds("visit", instrument="HSC", bind=bind, where=where)
2451 }
2452 )
2454 # Try a bunch of timespan queries, mixing up the bounds themselves,
2455 # where they appear in the expression, and how we get the timespan into
2456 # the expression.
2458 # t1 is before the start of i1, so this should not include i1.
2459 self.assertEqual(ids[:i1], query("visit.timespan OVERLAPS (null, t1)"))
2460 # t2 is exactly at the start of i2, but ends are exclusive, so these
2461 # should not include i2.
2462 self.assertEqual(ids[i1:i2], query("(t1, t2) OVERLAPS visit.timespan"))
2463 self.assertEqual(ids[:i2], query("visit.timespan < (t2, t4)"))
2464 # t3 is in the middle of i3, so this should include i3.
2465 self.assertEqual(ids[i2 : i3 + 1], query("visit.timespan OVERLAPS ts23"))
2466 # This one should not include t3 by the same reasoning.
2467 self.assertEqual(ids[i3 + 1 :], query("visit.timespan > (t1, t3)"))
2468 # t4 is exactly at the end of i4, so this should include i4.
2469 self.assertEqual(ids[i3 : i4 + 1], query(f"visit.timespan OVERLAPS (T'{t3.tai.isot}', t4)"))
2470 # i4's upper bound of t4 is exclusive so this should not include t4.
2471 self.assertEqual(ids[i4 + 1 :], query("visit.timespan OVERLAPS (t4, NULL)"))
2473 # Now some timespan vs. time scalar queries.
2474 self.assertEqual(ids[:i2], query("visit.timespan < t2"))
2475 self.assertEqual(ids[:i2], query("t2 > visit.timespan"))
2476 self.assertEqual(ids[i3 + 1 :], query("visit.timespan > t3"))
2477 self.assertEqual(ids[i3 + 1 :], query("t3 < visit.timespan"))
2478 self.assertEqual(ids[i3 : i3 + 1], query("visit.timespan OVERLAPS t3"))
2479 self.assertEqual(ids[i3 : i3 + 1], query(f"T'{t3.tai.isot}' OVERLAPS visit.timespan"))
2481 # Empty timespans should not overlap anything.
2482 self.assertEqual([], query("visit.timespan OVERLAPS (t3, t2)"))
2484 def testCollectionSummaries(self):
2485 """Test recording and retrieval of collection summaries."""
2486 self.maxDiff = None
2487 registry = self.makeRegistry()
2488 # Importing datasets from yaml should go through the code path where
2489 # we update collection summaries as we insert datasets.
2490 self.loadData(registry, "base.yaml")
2491 self.loadData(registry, "datasets.yaml")
2492 flat = registry.getDatasetType("flat")
2493 expected1 = CollectionSummary()
2494 expected1.dataset_types.add(registry.getDatasetType("bias"))
2495 expected1.add_data_ids(
2496 flat, [DataCoordinate.standardize(instrument="Cam1", universe=registry.dimensions)]
2497 )
2498 self.assertEqual(registry.getCollectionSummary("imported_g"), expected1)
2499 self.assertEqual(registry.getCollectionSummary("imported_r"), expected1)
2500 # Create a chained collection with both of the imported runs; the
2501 # summary should be the same, because it's a union with itself.
2502 chain = "chain"
2503 registry.registerCollection(chain, CollectionType.CHAINED)
2504 registry.setCollectionChain(chain, ["imported_r", "imported_g"])
2505 self.assertEqual(registry.getCollectionSummary(chain), expected1)
2506 # Associate flats only into a tagged collection and a calibration
2507 # collection to check summaries of those.
2508 tag = "tag"
2509 registry.registerCollection(tag, CollectionType.TAGGED)
2510 registry.associate(tag, registry.queryDatasets(flat, collections="imported_g"))
2511 calibs = "calibs"
2512 registry.registerCollection(calibs, CollectionType.CALIBRATION)
2513 registry.certify(
2514 calibs, registry.queryDatasets(flat, collections="imported_g"), timespan=Timespan(None, None)
2515 )
2516 expected2 = expected1.copy()
2517 expected2.dataset_types.discard("bias")
2518 self.assertEqual(registry.getCollectionSummary(tag), expected2)
2519 self.assertEqual(registry.getCollectionSummary(calibs), expected2)
2520 # Explicitly calling Registry.refresh() should load those same
2521 # summaries, via a totally different code path.
2522 registry.refresh()
2523 self.assertEqual(registry.getCollectionSummary("imported_g"), expected1)
2524 self.assertEqual(registry.getCollectionSummary("imported_r"), expected1)
2525 self.assertEqual(registry.getCollectionSummary(tag), expected2)
2526 self.assertEqual(registry.getCollectionSummary(calibs), expected2)
2528 def testBindInQueryDatasets(self):
2529 """Test that the bind parameter is correctly forwarded in
2530 queryDatasets recursion.
2531 """
2532 registry = self.makeRegistry()
2533 # Importing datasets from yaml should go through the code path where
2534 # we update collection summaries as we insert datasets.
2535 self.loadData(registry, "base.yaml")
2536 self.loadData(registry, "datasets.yaml")
2537 self.assertEqual(
2538 set(registry.queryDatasets("flat", band="r", collections=...)),
2539 set(registry.queryDatasets("flat", where="band=my_band", bind={"my_band": "r"}, collections=...)),
2540 )
2542 def testQueryIntRangeExpressions(self):
2543 """Test integer range expressions in ``where`` arguments.
2545 Note that our expressions use inclusive stop values, unlike Python's.
2546 """
2547 registry = self.makeRegistry()
2548 self.loadData(registry, "base.yaml")
2549 self.assertEqual(
2550 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (1..2)")),
2551 {registry.expandDataId(instrument="Cam1", detector=n) for n in [1, 2]},
2552 )
2553 self.assertEqual(
2554 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (1..4:2)")),
2555 {registry.expandDataId(instrument="Cam1", detector=n) for n in [1, 3]},
2556 )
2557 self.assertEqual(
2558 set(registry.queryDataIds(["detector"], instrument="Cam1", where="detector IN (2..4:2)")),
2559 {registry.expandDataId(instrument="Cam1", detector=n) for n in [2, 4]},
2560 )
2562 def testQueryResultSummaries(self):
2563 """Test summary methods like `count`, `any`, and `explain_no_results`
2564 on `DataCoordinateQueryResults` and `DatasetQueryResults`.
2565 """
2566 registry = self.makeRegistry()
2567 self.loadData(registry, "base.yaml")
2568 self.loadData(registry, "datasets.yaml")
2569 self.loadData(registry, "spatial.yaml")
2570 # Default test dataset has two collections, each with both flats and
2571 # biases. Add a new collection with only biases.
2572 registry.registerCollection("biases", CollectionType.TAGGED)
2573 registry.associate("biases", registry.queryDatasets("bias", collections=["imported_g"]))
2574 # First query yields two results, and involves no postprocessing.
2575 query1 = registry.queryDataIds(["physical_filter"], band="r")
2576 self.assertTrue(query1.any(execute=False, exact=False))
2577 self.assertTrue(query1.any(execute=True, exact=False))
2578 self.assertTrue(query1.any(execute=True, exact=True))
2579 self.assertEqual(query1.count(exact=False), 2)
2580 self.assertEqual(query1.count(exact=True), 2)
2581 self.assertFalse(list(query1.explain_no_results()))
2582 # Second query should yield no results, which we should see when
2583 # we attempt to expand the data ID.
2584 query2 = registry.queryDataIds(["physical_filter"], band="h")
2585 # There's no execute=False, exact=Fals test here because the behavior
2586 # not something we want to guarantee in this case (and exact=False
2587 # says either answer is legal).
2588 self.assertFalse(query2.any(execute=True, exact=False))
2589 self.assertFalse(query2.any(execute=True, exact=True))
2590 self.assertEqual(query2.count(exact=False), 0)
2591 self.assertEqual(query2.count(exact=True), 0)
2592 self.assertTrue(list(query2.explain_no_results()))
2593 # These queries yield no results due to various problems that can be
2594 # spotted prior to execution, yielding helpful diagnostics.
2595 base_query = registry.queryDataIds(["detector", "physical_filter"])
2596 queries_and_snippets = [
2597 (
2598 # Dataset type name doesn't match any existing dataset types.
2599 registry.queryDatasets("nonexistent", collections=...),
2600 ["nonexistent"],
2601 ),
2602 (
2603 # Dataset type object isn't registered.
2604 registry.queryDatasets(
2605 DatasetType(
2606 "nonexistent",
2607 dimensions=["instrument"],
2608 universe=registry.dimensions,
2609 storageClass="Image",
2610 ),
2611 collections=...,
2612 ),
2613 ["nonexistent"],
2614 ),
2615 (
2616 # No datasets of this type in this collection.
2617 registry.queryDatasets("flat", collections=["biases"]),
2618 ["flat", "biases"],
2619 ),
2620 (
2621 # No datasets of this type in this collection.
2622 base_query.findDatasets("flat", collections=["biases"]),
2623 ["flat", "biases"],
2624 ),
2625 (
2626 # No collections matching at all.
2627 registry.queryDatasets("flat", collections=re.compile("potato.+")),
2628 ["potato"],
2629 ),
2630 ]
2631 # The behavior of these additional queries is slated to change in the
2632 # future, so we also check for deprecation warnings.
2633 with self.assertWarns(FutureWarning):
2634 queries_and_snippets.append(
2635 (
2636 # Dataset type name doesn't match any existing dataset
2637 # types.
2638 registry.queryDataIds(["detector"], datasets=["nonexistent"], collections=...),
2639 ["nonexistent"],
2640 )
2641 )
2642 with self.assertWarns(FutureWarning):
2643 queries_and_snippets.append(
2644 (
2645 # Dataset type name doesn't match any existing dataset
2646 # types.
2647 registry.queryDimensionRecords("detector", datasets=["nonexistent"], collections=...),
2648 ["nonexistent"],
2649 )
2650 )
2651 for query, snippets in queries_and_snippets:
2652 self.assertFalse(query.any(execute=False, exact=False))
2653 self.assertFalse(query.any(execute=True, exact=False))
2654 self.assertFalse(query.any(execute=True, exact=True))
2655 self.assertEqual(query.count(exact=False), 0)
2656 self.assertEqual(query.count(exact=True), 0)
2657 messages = list(query.explain_no_results())
2658 self.assertTrue(messages)
2659 # Want all expected snippets to appear in at least one message.
2660 self.assertTrue(
2661 any(
2662 all(snippet in message for snippet in snippets) for message in query.explain_no_results()
2663 ),
2664 messages,
2665 )
2667 # This query does yield results, but should also emit a warning because
2668 # dataset type patterns to queryDataIds is deprecated; just look for
2669 # the warning.
2670 with self.assertWarns(FutureWarning):
2671 registry.queryDataIds(["detector"], datasets=re.compile("^nonexistent$"), collections=...)
2673 # These queries yield no results due to problems that can be identified
2674 # by cheap follow-up queries, yielding helpful diagnostics.
2675 for query, snippets in [
2676 (
2677 # No records for one of the involved dimensions.
2678 registry.queryDataIds(["subfilter"]),
2679 ["no rows", "subfilter"],
2680 ),
2681 (
2682 # No records for one of the involved dimensions.
2683 registry.queryDimensionRecords("subfilter"),
2684 ["no rows", "subfilter"],
2685 ),
2686 ]:
2687 self.assertFalse(query.any(execute=True, exact=False))
2688 self.assertFalse(query.any(execute=True, exact=True))
2689 self.assertEqual(query.count(exact=True), 0)
2690 messages = list(query.explain_no_results())
2691 self.assertTrue(messages)
2692 # Want all expected snippets to appear in at least one message.
2693 self.assertTrue(
2694 any(
2695 all(snippet in message for snippet in snippets) for message in query.explain_no_results()
2696 ),
2697 messages,
2698 )
2700 # This query yields four overlaps in the database, but one is filtered
2701 # out in postprocessing. The count queries aren't accurate because
2702 # they don't account for duplication that happens due to an internal
2703 # join against commonSkyPix.
2704 query3 = registry.queryDataIds(["visit", "tract"], instrument="Cam1", skymap="SkyMap1")
2705 self.assertEqual(
2706 {
2707 DataCoordinate.standardize(
2708 instrument="Cam1",
2709 skymap="SkyMap1",
2710 visit=v,
2711 tract=t,
2712 universe=registry.dimensions,
2713 )
2714 for v, t in [(1, 0), (2, 0), (2, 1)]
2715 },
2716 set(query3),
2717 )
2718 self.assertTrue(query3.any(execute=False, exact=False))
2719 self.assertTrue(query3.any(execute=True, exact=False))
2720 self.assertTrue(query3.any(execute=True, exact=True))
2721 self.assertGreaterEqual(query3.count(exact=False), 4)
2722 self.assertGreaterEqual(query3.count(exact=True, discard=True), 3)
2723 self.assertFalse(list(query3.explain_no_results()))
2724 # This query yields overlaps in the database, but all are filtered
2725 # out in postprocessing. The count queries again aren't very useful.
2726 # We have to use `where=` here to avoid an optimization that
2727 # (currently) skips the spatial postprocess-filtering because it
2728 # recognizes that no spatial join is necessary. That's not ideal, but
2729 # fixing it is out of scope for this ticket.
2730 query4 = registry.queryDataIds(
2731 ["visit", "tract"],
2732 instrument="Cam1",
2733 skymap="SkyMap1",
2734 where="visit=1 AND detector=1 AND tract=0 AND patch=4",
2735 )
2736 self.assertFalse(set(query4))
2737 self.assertTrue(query4.any(execute=False, exact=False))
2738 self.assertTrue(query4.any(execute=True, exact=False))
2739 self.assertFalse(query4.any(execute=True, exact=True))
2740 self.assertGreaterEqual(query4.count(exact=False), 1)
2741 self.assertEqual(query4.count(exact=True, discard=True), 0)
2742 messages = query4.explain_no_results()
2743 self.assertTrue(messages)
2744 self.assertTrue(any("overlap" in message for message in messages))
2745 # This query should yield results from one dataset type but not the
2746 # other, which is not registered.
2747 query5 = registry.queryDatasets(["bias", "nonexistent"], collections=["biases"])
2748 self.assertTrue(set(query5))
2749 self.assertTrue(query5.any(execute=False, exact=False))
2750 self.assertTrue(query5.any(execute=True, exact=False))
2751 self.assertTrue(query5.any(execute=True, exact=True))
2752 self.assertGreaterEqual(query5.count(exact=False), 1)
2753 self.assertGreaterEqual(query5.count(exact=True), 1)
2754 self.assertFalse(list(query5.explain_no_results()))
2755 # This query applies a selection that yields no results, fully in the
2756 # database. Explaining why it fails involves traversing the relation
2757 # tree and running a LIMIT 1 query at each level that has the potential
2758 # to remove rows.
2759 query6 = registry.queryDimensionRecords(
2760 "detector", where="detector.purpose = 'no-purpose'", instrument="Cam1"
2761 )
2762 self.assertEqual(query6.count(exact=True), 0)
2763 messages = query6.explain_no_results()
2764 self.assertTrue(messages)
2765 self.assertTrue(any("no-purpose" in message for message in messages))
2767 def testQueryDataIdsOrderBy(self):
2768 """Test order_by and limit on result returned by queryDataIds()."""
2769 registry = self.makeRegistry()
2770 self.loadData(registry, "base.yaml")
2771 self.loadData(registry, "datasets.yaml")
2772 self.loadData(registry, "spatial.yaml")
2774 def do_query(dimensions=("visit", "tract"), datasets=None, collections=None):
2775 return registry.queryDataIds(
2776 dimensions, datasets=datasets, collections=collections, instrument="Cam1", skymap="SkyMap1"
2777 )
2779 Test = namedtuple(
2780 "testQueryDataIdsOrderByTest",
2781 ("order_by", "keys", "result", "limit", "datasets", "collections"),
2782 defaults=(None, None, None),
2783 )
2785 test_data = (
2786 Test("tract,visit", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2787 Test("-tract,visit", "tract,visit", ((1, 2), (1, 2), (0, 1), (0, 1), (0, 2), (0, 2))),
2788 Test("tract,-visit", "tract,visit", ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2))),
2789 Test("-tract,-visit", "tract,visit", ((1, 2), (1, 2), (0, 2), (0, 2), (0, 1), (0, 1))),
2790 Test(
2791 "tract.id,visit.id",
2792 "tract,visit",
2793 ((0, 1), (0, 1), (0, 2)),
2794 limit=(3,),
2795 ),
2796 Test("-tract,-visit", "tract,visit", ((1, 2), (1, 2), (0, 2)), limit=(3,)),
2797 Test("tract,visit", "tract,visit", ((0, 2), (1, 2), (1, 2)), limit=(3, 3)),
2798 Test("-tract,-visit", "tract,visit", ((0, 1),), limit=(3, 5)),
2799 Test(
2800 "tract,visit.exposure_time", "tract,visit", ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2))
2801 ),
2802 Test(
2803 "-tract,-visit.exposure_time", "tract,visit", ((1, 2), (1, 2), (0, 1), (0, 1), (0, 2), (0, 2))
2804 ),
2805 Test("tract,-exposure_time", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2806 Test("tract,visit.name", "tract,visit", ((0, 1), (0, 1), (0, 2), (0, 2), (1, 2), (1, 2))),
2807 Test(
2808 "tract,-timespan.begin,timespan.end",
2809 "tract,visit",
2810 ((0, 2), (0, 2), (0, 1), (0, 1), (1, 2), (1, 2)),
2811 ),
2812 Test("visit.day_obs,exposure.day_obs", "visit,exposure", ()),
2813 Test("visit.timespan.begin,-exposure.timespan.begin", "visit,exposure", ()),
2814 Test(
2815 "tract,detector",
2816 "tract,detector",
2817 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2818 datasets="flat",
2819 collections="imported_r",
2820 ),
2821 Test(
2822 "tract,detector.full_name",
2823 "tract,detector",
2824 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2825 datasets="flat",
2826 collections="imported_r",
2827 ),
2828 Test(
2829 "tract,detector.raft,detector.name_in_raft",
2830 "tract,detector",
2831 ((0, 1), (0, 2), (0, 3), (0, 4), (1, 1), (1, 2), (1, 3), (1, 4)),
2832 datasets="flat",
2833 collections="imported_r",
2834 ),
2835 )
2837 for test in test_data:
2838 order_by = test.order_by.split(",")
2839 keys = test.keys.split(",")
2840 query = do_query(keys, test.datasets, test.collections).order_by(*order_by)
2841 if test.limit is not None:
2842 query = query.limit(*test.limit)
2843 dataIds = tuple(tuple(dataId[k] for k in keys) for dataId in query)
2844 self.assertEqual(dataIds, test.result)
2846 # and materialize
2847 query = do_query(keys).order_by(*order_by)
2848 if test.limit is not None:
2849 query = query.limit(*test.limit)
2850 with self.assertRaises(RelationalAlgebraError):
2851 with query.materialize():
2852 pass
2854 # errors in a name
2855 for order_by in ("", "-"):
2856 with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"):
2857 list(do_query().order_by(order_by))
2859 for order_by in ("undimension.name", "-undimension.name"):
2860 with self.assertRaisesRegex(ValueError, "Unknown dimension element name 'undimension'"):
2861 list(do_query().order_by(order_by))
2863 for order_by in ("attract", "-attract"):
2864 with self.assertRaisesRegex(ValueError, "Metadata 'attract' cannot be found in any dimension"):
2865 list(do_query().order_by(order_by))
2867 with self.assertRaisesRegex(ValueError, "Metadata 'exposure_time' exists in more than one dimension"):
2868 list(do_query(("exposure", "visit")).order_by("exposure_time"))
2870 with self.assertRaisesRegex(ValueError, "Timespan exists in more than one dimesion"):
2871 list(do_query(("exposure", "visit")).order_by("timespan.begin"))
2873 with self.assertRaisesRegex(
2874 ValueError, "Cannot find any temporal dimension element for 'timespan.begin'"
2875 ):
2876 list(do_query("tract").order_by("timespan.begin"))
2878 with self.assertRaisesRegex(ValueError, "Cannot use 'timespan.begin' with non-temporal element"):
2879 list(do_query("tract").order_by("tract.timespan.begin"))
2881 with self.assertRaisesRegex(ValueError, "Field 'name' does not exist in 'tract'."):
2882 list(do_query("tract").order_by("tract.name"))
2884 def testQueryDataIdsGovernorExceptions(self):
2885 """Test exceptions raised by queryDataIds() for incorrect governors."""
2886 registry = self.makeRegistry()
2887 self.loadData(registry, "base.yaml")
2888 self.loadData(registry, "datasets.yaml")
2889 self.loadData(registry, "spatial.yaml")
2891 def do_query(dimensions, dataId=None, where="", bind=None, **kwargs):
2892 return registry.queryDataIds(dimensions, dataId=dataId, where=where, bind=bind, **kwargs)
2894 Test = namedtuple(
2895 "testQueryDataIdExceptionsTest",
2896 ("dimensions", "dataId", "where", "bind", "kwargs", "exception", "count"),
2897 defaults=(None, None, None, {}, None, 0),
2898 )
2900 test_data = (
2901 Test("tract,visit", count=6),
2902 Test("tract,visit", kwargs={"instrument": "Cam1", "skymap": "SkyMap1"}, count=6),
2903 Test(
2904 "tract,visit", kwargs={"instrument": "Cam2", "skymap": "SkyMap1"}, exception=DataIdValueError
2905 ),
2906 Test("tract,visit", dataId={"instrument": "Cam1", "skymap": "SkyMap1"}, count=6),
2907 Test(
2908 "tract,visit", dataId={"instrument": "Cam1", "skymap": "SkyMap2"}, exception=DataIdValueError
2909 ),
2910 Test("tract,visit", where="instrument='Cam1' AND skymap='SkyMap1'", count=6),
2911 Test("tract,visit", where="instrument='Cam1' AND skymap='SkyMap5'", exception=DataIdValueError),
2912 Test(
2913 "tract,visit",
2914 where="instrument=cam AND skymap=map",
2915 bind={"cam": "Cam1", "map": "SkyMap1"},
2916 count=6,
2917 ),
2918 Test(
2919 "tract,visit",
2920 where="instrument=cam AND skymap=map",
2921 bind={"cam": "Cam", "map": "SkyMap"},
2922 exception=DataIdValueError,
2923 ),
2924 )
2926 for test in test_data:
2927 dimensions = test.dimensions.split(",")
2928 if test.exception:
2929 with self.assertRaises(test.exception):
2930 do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs).count()
2931 else:
2932 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2933 self.assertEqual(query.count(discard=True), test.count)
2935 # and materialize
2936 if test.exception:
2937 with self.assertRaises(test.exception):
2938 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2939 with query.materialize() as materialized:
2940 materialized.count(discard=True)
2941 else:
2942 query = do_query(dimensions, test.dataId, test.where, bind=test.bind, **test.kwargs)
2943 with query.materialize() as materialized:
2944 self.assertEqual(materialized.count(discard=True), test.count)
2946 def testQueryDimensionRecordsOrderBy(self):
2947 """Test order_by and limit on result returned by
2948 queryDimensionRecords().
2949 """
2950 registry = self.makeRegistry()
2951 self.loadData(registry, "base.yaml")
2952 self.loadData(registry, "datasets.yaml")
2953 self.loadData(registry, "spatial.yaml")
2955 def do_query(element, datasets=None, collections=None):
2956 return registry.queryDimensionRecords(
2957 element, instrument="Cam1", datasets=datasets, collections=collections
2958 )
2960 query = do_query("detector")
2961 self.assertEqual(len(list(query)), 4)
2963 Test = namedtuple(
2964 "testQueryDataIdsOrderByTest",
2965 ("element", "order_by", "result", "limit", "datasets", "collections"),
2966 defaults=(None, None, None),
2967 )
2969 test_data = (
2970 Test("detector", "detector", (1, 2, 3, 4)),
2971 Test("detector", "-detector", (4, 3, 2, 1)),
2972 Test("detector", "raft,-name_in_raft", (2, 1, 4, 3)),
2973 Test("detector", "-detector.purpose", (4,), limit=(1,)),
2974 Test("detector", "-purpose,detector.raft,name_in_raft", (2, 3), limit=(2, 2)),
2975 Test("visit", "visit", (1, 2)),
2976 Test("visit", "-visit.id", (2, 1)),
2977 Test("visit", "zenith_angle", (1, 2)),
2978 Test("visit", "-visit.name", (2, 1)),
2979 Test("visit", "day_obs,-timespan.begin", (2, 1)),
2980 )
2982 for test in test_data:
2983 order_by = test.order_by.split(",")
2984 query = do_query(test.element).order_by(*order_by)
2985 if test.limit is not None:
2986 query = query.limit(*test.limit)
2987 dataIds = tuple(rec.id for rec in query)
2988 self.assertEqual(dataIds, test.result)
2990 # errors in a name
2991 for order_by in ("", "-"):
2992 with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"):
2993 list(do_query("detector").order_by(order_by))
2995 for order_by in ("undimension.name", "-undimension.name"):
2996 with self.assertRaisesRegex(ValueError, "Element name mismatch: 'undimension'"):
2997 list(do_query("detector").order_by(order_by))
2999 for order_by in ("attract", "-attract"):
3000 with self.assertRaisesRegex(ValueError, "Field 'attract' does not exist in 'detector'."):
3001 list(do_query("detector").order_by(order_by))
3003 def testQueryDimensionRecordsExceptions(self):
3004 """Test exceptions raised by queryDimensionRecords()."""
3005 registry = self.makeRegistry()
3006 self.loadData(registry, "base.yaml")
3007 self.loadData(registry, "datasets.yaml")
3008 self.loadData(registry, "spatial.yaml")
3010 result = registry.queryDimensionRecords("detector")
3011 self.assertEqual(result.count(), 4)
3012 result = registry.queryDimensionRecords("detector", instrument="Cam1")
3013 self.assertEqual(result.count(), 4)
3014 result = registry.queryDimensionRecords("detector", dataId={"instrument": "Cam1"})
3015 self.assertEqual(result.count(), 4)
3016 result = registry.queryDimensionRecords("detector", where="instrument='Cam1'")
3017 self.assertEqual(result.count(), 4)
3018 result = registry.queryDimensionRecords("detector", where="instrument=instr", bind={"instr": "Cam1"})
3019 self.assertEqual(result.count(), 4)
3021 with self.assertRaisesRegex(DataIdValueError, "dimension instrument"):
3022 result = registry.queryDimensionRecords("detector", instrument="NotCam1")
3023 result.count()
3025 with self.assertRaisesRegex(DataIdValueError, "dimension instrument"):
3026 result = registry.queryDimensionRecords("detector", dataId={"instrument": "NotCam1"})
3027 result.count()
3029 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
3030 result = registry.queryDimensionRecords("detector", where="instrument='NotCam1'")
3031 result.count()
3033 with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
3034 result = registry.queryDimensionRecords(
3035 "detector", where="instrument=instr", bind={"instr": "NotCam1"}
3036 )
3037 result.count()
3039 def testDatasetConstrainedDimensionRecordQueries(self):
3040 """Test that queryDimensionRecords works even when given a dataset
3041 constraint whose dimensions extend beyond the requested dimension
3042 element's.
3043 """
3044 registry = self.makeRegistry()
3045 self.loadData(registry, "base.yaml")
3046 self.loadData(registry, "datasets.yaml")
3047 # Query for physical_filter dimension records, using a dataset that
3048 # has both physical_filter and dataset dimensions.
3049 records = registry.queryDimensionRecords(
3050 "physical_filter",
3051 datasets=["flat"],
3052 collections="imported_r",
3053 )
3054 self.assertEqual({record.name for record in records}, {"Cam1-R1", "Cam1-R2"})
3055 # Trying to constrain by all dataset types is an error.
3056 with self.assertRaises(TypeError):
3057 list(registry.queryDimensionRecords("physical_filter", datasets=..., collections="imported_r"))
3059 def testSkyPixDatasetQueries(self):
3060 """Test that we can build queries involving skypix dimensions as long
3061 as a dataset type that uses those dimensions is included.
3062 """
3063 registry = self.makeRegistry()
3064 self.loadData(registry, "base.yaml")
3065 dataset_type = DatasetType(
3066 "a", dimensions=["htm7", "instrument"], universe=registry.dimensions, storageClass="int"
3067 )
3068 registry.registerDatasetType(dataset_type)
3069 run = "r"
3070 registry.registerRun(run)
3071 # First try queries where there are no datasets; the concern is whether
3072 # we can even build and execute these queries without raising, even
3073 # when "doomed" query shortcuts are in play.
3074 self.assertFalse(
3075 list(registry.queryDataIds(["htm7", "instrument"], datasets=dataset_type, collections=run))
3076 )
3077 self.assertFalse(list(registry.queryDatasets(dataset_type, collections=run)))
3078 # Now add a dataset and see that we can get it back.
3079 htm7 = registry.dimensions.skypix["htm"][7].pixelization
3080 data_id = registry.expandDataId(instrument="Cam1", htm7=htm7.universe()[0][0])
3081 (ref,) = registry.insertDatasets(dataset_type, [data_id], run=run)
3082 self.assertEqual(
3083 set(registry.queryDataIds(["htm7", "instrument"], datasets=dataset_type, collections=run)),
3084 {data_id},
3085 )
3086 self.assertEqual(set(registry.queryDatasets(dataset_type, collections=run)), {ref})
3088 def testDatasetIdFactory(self):
3089 """Simple test for DatasetIdFactory, mostly to catch potential changes
3090 in its API.
3091 """
3092 registry = self.makeRegistry()
3093 factory = registry.datasetIdFactory
3094 dataset_type = DatasetType(
3095 "datasetType",
3096 dimensions=["detector", "instrument"],
3097 universe=registry.dimensions,
3098 storageClass="int",
3099 )
3100 run = "run"
3101 data_id = DataCoordinate.standardize(instrument="Cam1", detector=1, graph=dataset_type.dimensions)
3103 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.UNIQUE)
3104 self.assertIsInstance(datasetId, uuid.UUID)
3105 self.assertEqual(datasetId.version, 4)
3107 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE)
3108 self.assertIsInstance(datasetId, uuid.UUID)
3109 self.assertEqual(datasetId.version, 5)
3111 datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN)
3112 self.assertIsInstance(datasetId, uuid.UUID)
3113 self.assertEqual(datasetId.version, 5)
3115 def testExposureQueries(self):
3116 """Test query methods using arguments sourced from the exposure log
3117 service.
3119 The most complete test dataset currently available to daf_butler tests
3120 is hsc-rc2-subset.yaml export (which is unfortunately distinct from the
3121 the lsst/rc2_subset GitHub repo), but that does not have 'exposure'
3122 dimension records as it was focused on providing nontrivial spatial
3123 overlaps between visit+detector and tract+patch. So in this test we
3124 need to translate queries that originally used the exposure dimension
3125 to use the (very similar) visit dimension instead.
3126 """
3127 registry = self.makeRegistry()
3128 self.loadData(registry, "hsc-rc2-subset.yaml")
3129 self.assertEqual(
3130 [
3131 record.id
3132 for record in registry.queryDimensionRecords("visit", instrument="HSC")
3133 .order_by("id")
3134 .limit(5)
3135 ],
3136 [318, 322, 326, 330, 332],
3137 )
3138 self.assertEqual(
3139 [
3140 data_id["visit"]
3141 for data_id in registry.queryDataIds(["visit"], instrument="HSC").order_by("id").limit(5)
3142 ],
3143 [318, 322, 326, 330, 332],
3144 )
3145 self.assertEqual(
3146 [
3147 record.id
3148 for record in registry.queryDimensionRecords("detector", instrument="HSC")
3149 .order_by("full_name")
3150 .limit(5)
3151 ],
3152 [73, 72, 71, 70, 65],
3153 )
3154 self.assertEqual(
3155 [
3156 data_id["detector"]
3157 for data_id in registry.queryDataIds(["detector"], instrument="HSC")
3158 .order_by("full_name")
3159 .limit(5)
3160 ],
3161 [73, 72, 71, 70, 65],
3162 )
3164 def test_long_query_names(self) -> None:
3165 """Test that queries involving very long names are handled correctly.
3167 This is especially important for PostgreSQL, which truncates symbols
3168 longer than 64 chars, but it's worth testing for all DBs.
3169 """
3170 registry = self.makeRegistry()
3171 name = "abcd" * 17
3172 registry.registerDatasetType(
3173 DatasetType(
3174 name,
3175 dimensions=(),
3176 storageClass="Exposure",
3177 universe=registry.dimensions,
3178 )
3179 )
3180 # Need to search more than one collection actually containing a
3181 # matching dataset to avoid optimizations that sidestep bugs due to
3182 # truncation by making findFirst=True a no-op.
3183 run1 = "run1"
3184 registry.registerRun(run1)
3185 run2 = "run2"
3186 registry.registerRun(run2)
3187 (ref1,) = registry.insertDatasets(name, [DataCoordinate.makeEmpty(registry.dimensions)], run1)
3188 registry.insertDatasets(name, [DataCoordinate.makeEmpty(registry.dimensions)], run2)
3189 self.assertEqual(
3190 set(registry.queryDatasets(name, collections=[run1, run2], findFirst=True)),
3191 {ref1},
3192 )
3194 def test_skypix_constraint_queries(self) -> None:
3195 """Test queries spatially constrained by a skypix data ID."""
3196 registry = self.makeRegistry()
3197 self.loadData(registry, "hsc-rc2-subset.yaml")
3198 patch_regions = {
3199 (data_id["tract"], data_id["patch"]): data_id.region
3200 for data_id in registry.queryDataIds(["patch"]).expanded()
3201 }
3202 skypix_dimension: SkyPixDimension = registry.dimensions["htm11"]
3203 # This check ensures the test doesn't become trivial due to a config
3204 # change; if it does, just pick a different HTML level.
3205 self.assertNotEqual(skypix_dimension, registry.dimensions.commonSkyPix)
3206 # Gather all skypix IDs that definitely overlap at least one of these
3207 # patches.
3208 relevant_skypix_ids = lsst.sphgeom.RangeSet()
3209 for patch_region in patch_regions.values():
3210 relevant_skypix_ids |= skypix_dimension.pixelization.interior(patch_region)
3211 # Look for a "nontrivial" skypix_id that overlaps at least one patch
3212 # and does not overlap at least one other patch.
3213 for skypix_id in itertools.chain.from_iterable(
3214 range(begin, end) for begin, end in relevant_skypix_ids
3215 ):
3216 skypix_region = skypix_dimension.pixelization.pixel(skypix_id)
3217 overlapping_patches = {
3218 patch_key
3219 for patch_key, patch_region in patch_regions.items()
3220 if not patch_region.isDisjointFrom(skypix_region)
3221 }
3222 if overlapping_patches and overlapping_patches != patch_regions.keys():
3223 break
3224 else:
3225 raise RuntimeError("Could not find usable skypix ID for this dimension configuration.")
3226 self.assertEqual(
3227 {
3228 (data_id["tract"], data_id["patch"])
3229 for data_id in registry.queryDataIds(
3230 ["patch"],
3231 dataId={skypix_dimension.name: skypix_id},
3232 )
3233 },
3234 overlapping_patches,
3235 )
3236 # Test that a three-way join that includes the common skypix system in
3237 # the dimensions doesn't generate redundant join terms in the query.
3238 full_data_ids = set(
3239 registry.queryDataIds(
3240 ["tract", "visit", "htm7"], skymap="hsc_rings_v1", instrument="HSC"
3241 ).expanded()
3242 )
3243 self.assertGreater(len(full_data_ids), 0)
3244 for data_id in full_data_ids:
3245 self.assertFalse(data_id.records["tract"].region.isDisjointFrom(data_id.records["htm7"].region))
3246 self.assertFalse(data_id.records["visit"].region.isDisjointFrom(data_id.records["htm7"].region))
3248 def test_spatial_constraint_queries(self) -> None:
3249 """Test queries in which one spatial dimension in the constraint (data
3250 ID or ``where`` string) constrains a different spatial dimension in the
3251 query result columns.
3252 """
3253 registry = self.makeRegistry()
3254 self.loadData(registry, "hsc-rc2-subset.yaml")
3255 patch_regions = {
3256 (data_id["tract"], data_id["patch"]): data_id.region
3257 for data_id in registry.queryDataIds(["patch"]).expanded()
3258 }
3259 observation_regions = {
3260 (data_id["visit"], data_id["detector"]): data_id.region
3261 for data_id in registry.queryDataIds(["visit", "detector"]).expanded()
3262 }
3263 all_combos = {
3264 (patch_key, observation_key)
3265 for patch_key, observation_key in itertools.product(patch_regions, observation_regions)
3266 }
3267 overlapping_combos = {
3268 (patch_key, observation_key)
3269 for patch_key, observation_key in all_combos
3270 if not patch_regions[patch_key].isDisjointFrom(observation_regions[observation_key])
3271 }
3272 # Check a direct spatial join with no constraint first.
3273 self.assertEqual(
3274 {
3275 ((data_id["tract"], data_id["patch"]), (data_id["visit"], data_id["detector"]))
3276 for data_id in registry.queryDataIds(["patch", "visit", "detector"])
3277 },
3278 overlapping_combos,
3279 )
3280 overlaps_by_patch: defaultdict[tuple[int, int], set[tuple[str, str]]] = defaultdict(set)
3281 overlaps_by_observation: defaultdict[tuple[int, int], set[tuple[str, str]]] = defaultdict(set)
3282 for patch_key, observation_key in overlapping_combos:
3283 overlaps_by_patch[patch_key].add(observation_key)
3284 overlaps_by_observation[observation_key].add(patch_key)
3285 # Find patches and observations that overlap at least one of the other
3286 # but not all of the other.
3287 nontrivial_patch = next(
3288 iter(
3289 patch_key
3290 for patch_key, observation_keys in overlaps_by_patch.items()
3291 if observation_keys and observation_keys != observation_regions.keys()
3292 )
3293 )
3294 nontrivial_observation = next(
3295 iter(
3296 observation_key
3297 for observation_key, patch_keys in overlaps_by_observation.items()
3298 if patch_keys and patch_keys != patch_regions.keys()
3299 )
3300 )
3301 # Use the nontrivial patches and observations as constraints on the
3302 # other dimensions in various ways, first via a 'where' expression.
3303 # It's better in general to us 'bind' instead of f-strings, but these
3304 # all integers so there are no quoting concerns.
3305 self.assertEqual(
3306 {
3307 (data_id["visit"], data_id["detector"])
3308 for data_id in registry.queryDataIds(
3309 ["visit", "detector"],
3310 where=f"tract={nontrivial_patch[0]} AND patch={nontrivial_patch[1]}",
3311 skymap="hsc_rings_v1",
3312 )
3313 },
3314 overlaps_by_patch[nontrivial_patch],
3315 )
3316 self.assertEqual(
3317 {
3318 (data_id["tract"], data_id["patch"])
3319 for data_id in registry.queryDataIds(
3320 ["patch"],
3321 where=f"visit={nontrivial_observation[0]} AND detector={nontrivial_observation[1]}",
3322 instrument="HSC",
3323 )
3324 },
3325 overlaps_by_observation[nontrivial_observation],
3326 )
3327 # and then via the dataId argument.
3328 self.assertEqual(
3329 {
3330 (data_id["visit"], data_id["detector"])
3331 for data_id in registry.queryDataIds(
3332 ["visit", "detector"],
3333 dataId={
3334 "tract": nontrivial_patch[0],
3335 "patch": nontrivial_patch[1],
3336 },
3337 skymap="hsc_rings_v1",
3338 )
3339 },
3340 overlaps_by_patch[nontrivial_patch],
3341 )
3342 self.assertEqual(
3343 {
3344 (data_id["tract"], data_id["patch"])
3345 for data_id in registry.queryDataIds(
3346 ["patch"],
3347 dataId={
3348 "visit": nontrivial_observation[0],
3349 "detector": nontrivial_observation[1],
3350 },
3351 instrument="HSC",
3352 )
3353 },
3354 overlaps_by_observation[nontrivial_observation],
3355 )
3357 def test_query_projection_drop_postprocessing(self) -> None:
3358 """Test that projections and deduplications on query objects can
3359 drop post-query region filtering to ensure the query remains in
3360 the SQL engine.
3361 """
3362 registry = self.makeRegistry()
3363 self.loadData(registry, "base.yaml")
3364 self.loadData(registry, "spatial.yaml")
3366 def pop_transfer(tree: Relation) -> Relation:
3367 """If a relation tree terminates with a transfer to a new engine,
3368 return the relation prior to that transfer. If not, return the
3369 original relation.
3370 """
3371 match tree:
3372 case Transfer(target=target):
3373 return target
3374 case _:
3375 return tree
3377 # There's no public way to get a Query object yet, so we get one from a
3378 # DataCoordinateQueryResults private attribute. When a public API is
3379 # available this test should use it.
3380 query = registry.queryDataIds(["visit", "detector", "tract", "patch"])._query
3381 # We expect this query to terminate in the iteration engine originally,
3382 # because region-filtering is necessary.
3383 self.assertIsInstance(pop_transfer(query.relation).engine, iteration.Engine)
3384 # If we deduplicate, we usually have to do that downstream of the
3385 # filtering. That means the deduplication has to happen in the
3386 # iteration engine.
3387 self.assertIsInstance(pop_transfer(query.projected(unique=True).relation).engine, iteration.Engine)
3388 # If we pass drop_postprocessing, we instead drop the region filtering
3389 # so the deduplication can happen in SQL (though there might still be
3390 # transfer to iteration at the tail of the tree that we can ignore;
3391 # that's what the pop_transfer takes care of here).
3392 self.assertIsInstance(
3393 pop_transfer(query.projected(unique=True, drop_postprocessing=True).relation).engine,
3394 sql.Engine,
3395 )
3397 def test_query_empty_collections(self) -> None:
3398 """Test for registry query methods with empty collections. The methods
3399 should return empty result set (or None when applicable) and provide
3400 "doomed" diagnostics.
3401 """
3402 registry = self.makeRegistry()
3403 self.loadData(registry, "base.yaml")
3404 self.loadData(registry, "datasets.yaml")
3406 # Tests for registry.findDataset()
3407 with self.assertRaises(NoDefaultCollectionError):
3408 registry.findDataset("bias", instrument="Cam1", detector=1)
3409 self.assertIsNotNone(registry.findDataset("bias", instrument="Cam1", detector=1, collections=...))
3410 self.assertIsNone(registry.findDataset("bias", instrument="Cam1", detector=1, collections=[]))
3412 # Tests for registry.queryDatasets()
3413 with self.assertRaises(NoDefaultCollectionError):
3414 registry.queryDatasets("bias")
3415 self.assertTrue(list(registry.queryDatasets("bias", collections=...)))
3417 result = registry.queryDatasets("bias", collections=[])
3418 self.assertEqual(len(list(result)), 0)
3419 messages = list(result.explain_no_results())
3420 self.assertTrue(messages)
3421 self.assertTrue(any("because collection list is empty" in message for message in messages))
3423 # Tests for registry.queryDataIds()
3424 with self.assertRaises(NoDefaultCollectionError):
3425 registry.queryDataIds("detector", datasets="bias")
3426 self.assertTrue(list(registry.queryDataIds("detector", datasets="bias", collections=...)))
3428 result = registry.queryDataIds("detector", datasets="bias", collections=[])
3429 self.assertEqual(len(list(result)), 0)
3430 messages = list(result.explain_no_results())
3431 self.assertTrue(messages)
3432 self.assertTrue(any("because collection list is empty" in message for message in messages))
3434 # Tests for registry.queryDimensionRecords()
3435 with self.assertRaises(NoDefaultCollectionError):
3436 registry.queryDimensionRecords("detector", datasets="bias")
3437 self.assertTrue(list(registry.queryDimensionRecords("detector", datasets="bias", collections=...)))
3439 result = registry.queryDimensionRecords("detector", datasets="bias", collections=[])
3440 self.assertEqual(len(list(result)), 0)
3441 messages = list(result.explain_no_results())
3442 self.assertTrue(messages)
3443 self.assertTrue(any("because collection list is empty" in message for message in messages))