Coverage for python / lsst / daf / butler / tests / butler_queries.py: 8%
834 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 08:41 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 08:41 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = ()
32import unittest
33import unittest.mock
34from abc import ABC, abstractmethod
35from collections.abc import Iterable, Sequence
36from operator import attrgetter
37from uuid import UUID
39import astropy.coordinates
40import astropy.time
41from numpy import int64
43from lsst.sphgeom import LonLat, Region
45from .._butler import Butler
46from .._collection_type import CollectionType
47from .._dataset_ref import DatasetRef
48from .._dataset_type import DatasetType
49from .._exceptions import (
50 EmptyQueryResultError,
51 InvalidQueryError,
52 MissingCollectionError,
53 MissingDatasetTypeError,
54)
55from .._timespan import Timespan
56from ..dimensions import DataCoordinate, DimensionRecord
57from ..direct_query_driver import DirectQueryDriver
58from ..queries import DimensionRecordQueryResults, Query
59from ..queries.tree import Predicate
60from ..registry import NoDefaultCollectionError, RegistryDefaults
61from .utils import TestCaseMixin
# Simplified tuples of the detector records we'll frequently be querying for,
# keyed by detector ID.  Each value is (instrument, id, full_name, purpose),
# the same fields `make_detector_tuples` extracts from a dimension record.
DETECTOR_TUPLES = {
    detector_id: ("Cam1", detector_id, full_name, purpose)
    for detector_id, full_name, purpose in (
        (1, "Aa", "SCIENCE"),
        (2, "Ab", "SCIENCE"),
        (3, "Ba", "SCIENCE"),
        (4, "Bb", "WAVEFRONT"),
    )
}
def make_detector_tuples(records: Iterable[DimensionRecord]) -> dict[int, tuple[str, int, str, str]]:
    """Convert detector dimension records into ``DETECTOR_TUPLES``-style
    entries.

    Parameters
    ----------
    records : `~collections.abc.Iterable` [ `.dimensions.DimensionRecord` ]
        Detector dimension records to convert.

    Returns
    -------
    tuples : `dict` [ `int`, `tuple` ]
        Mapping from detector ID to an ``(instrument, id, full_name,
        purpose)`` tuple, in the order the records were iterated.  If two
        records share an ID the later one wins.
    """
    tuples: dict[int, tuple[str, int, str, str]] = {}
    for detector in records:
        detector_id = detector.id
        tuples[detector_id] = (detector.instrument, detector_id, detector.full_name, detector.purpose)
    return tuples
90class ButlerQueryTests(ABC, TestCaseMixin):
91 """Base class for unit tests that test `lsst.daf.butler.Butler.query`
92 implementations.
93 """
@abstractmethod
def make_butler(self, *args: str) -> Butler:
    """Create the butler that the tests in this class run against.

    Concrete test cases must override this method.

    Parameters
    ----------
    *args : `str`
        Names of the files to pass to `load_data`.

    Returns
    -------
    butler : `Butler`
        Butler to use for tests.
    """
    raise NotImplementedError
def load_data(self, butler: Butler, filename: str) -> None:
    """Import one registry test-data file into a butler.

    The file is read from the ``tests/registry_data`` resource directory of
    the ``lsst.daf.butler`` package and imported without a datastore.

    This method should be called from implementations of `make_butler`
    where the Registry should exist.

    Parameters
    ----------
    butler : `~lsst.daf.butler.Butler`
        The butler to import into.
    filename : `str`
        Name of the import/export file inside the ``registry_data``
        resource directory.
    """
    resource_uri = f"resource://lsst.daf.butler/tests/registry_data/{filename}"
    butler.import_(filename=resource_uri, without_datastore=True)
def check_detector_records(
    self,
    results: DimensionRecordQueryResults,
    ids: Sequence[int] = (1, 2, 3, 4),
    ordered: bool = False,
    messages: Iterable[str] = (),
    doomed: bool = False,
    has_postprocessing: bool = False,
) -> None:
    """Check a lazy detector-record query result object.

    Exercises ``count``, ``any``, and ``explain_no_results`` on the result
    object, then iterates it and checks the returned records.

    Parameters
    ----------
    results : `DimensionRecordQueryResults`
        Query results to check; must be for the ``detector`` element.
    ids : `~collections.abc.Sequence` [ `int` ], optional
        Detector IDs the query is expected to return.
    ordered : `bool`, optional
        If `True`, the records must come back in the order given by ``ids``.
    messages : `~collections.abc.Iterable` [ `str` ], optional
        Messages ``explain_no_results`` is expected to return (in any order).
    doomed : `bool`, optional
        If `True`, ``any`` is expected to report `False` even with
        ``execute=False``.  Otherwise the inexact unexecuted check must be
        `True` and the exact unexecuted check must raise.
    has_postprocessing : `bool`, optional
        If `True` (and the query is not doomed), ``count()`` with default
        arguments is expected to raise, and the inexact non-discarding count
        is only required to be at least ``len(ids)``.
    """
    n_expected = len(ids)

    # The results must describe exactly the detector element.
    self.assertEqual(results.element.name, "detector")
    detector_group = results.dimensions.universe["detector"].minimal_group
    self.assertEqual(results.dimensions, detector_group)

    # ``count(discard=True)`` must match the expectation in every scenario.
    self.assertEqual(results.count(discard=True), n_expected)
    if doomed or not has_postprocessing:
        self.assertEqual(results.count(discard=False), n_expected)
        self.assertEqual(results.count(discard=True, exact=False), n_expected)
        self.assertEqual(results.count(discard=False, exact=False), n_expected)
    else:
        self.assertGreaterEqual(results.count(discard=False, exact=False), n_expected)
        with self.assertRaisesRegex(InvalidQueryError, "^Cannot count query rows"):
            results.count()

    self.assertEqual(results.any(), bool(ids))
    if doomed:
        self.assertFalse(results.any(exact=False, execute=False))
        self.assertFalse(results.any(exact=True, execute=False))
    else:
        self.assertTrue(results.any(exact=False, execute=False))
        with self.assertRaisesRegex(InvalidQueryError, "^Cannot obtain exact"):
            results.any(exact=True, execute=False)

    self.assertCountEqual(results.explain_no_results(), list(messages))
    self.check_detector_records_returned(list(results), ids=ids, ordered=ordered)
def check_detector_records_returned(
    self,
    results: list[DimensionRecord],
    ids: Sequence[int] = (1, 2, 3, 4),
    ordered: bool = False,
) -> None:
    """Check a list of detector records against ``DETECTOR_TUPLES``.

    Parameters
    ----------
    results : `list` [ `DimensionRecord` ]
        Detector records returned by a query.
    ids : `~collections.abc.Sequence` [ `int` ], optional
        Detector IDs expected in ``results``.
    ordered : `bool`, optional
        If `True`, require the records in exactly the order of ``ids``;
        otherwise only require the same records in any order.
    """
    wanted = [DETECTOR_TUPLES[detector_id] for detector_id in ids]
    actual = list(make_detector_tuples(results).values())
    compare = self.assertEqual if ordered else self.assertCountEqual
    compare(actual, wanted)
def test_simple_record_query(self) -> None:
    """Test the basics of the query system using simple dimension-record
    queries.

    Covers ``order_by``, ``limit``, and ``where`` expressions, restricted to
    cases with no datasets, no dimension projections, and no
    spatial/temporal overlaps.  Most checks are run both through the
    `Query` object and through `Butler.query_dimension_records`.
    """
    butler = self.make_butler("base.yaml")
    with butler.query() as query:
        x = query.expression_factory
        detectors = query.dimension_records("detector")

        # Unconstrained query.
        self.check_detector_records(detectors)
        self.check_detector_records_returned(butler.query_dimension_records("detector"))

        # limit=0 is expected to return nothing.
        no_records = butler.query_dimension_records("detector", limit=0)
        self.assertEqual(len(no_records), 0)

        # Ascending order by detector.
        self.check_detector_records(detectors.order_by("detector"), ordered=True)
        ascending = butler.query_dimension_records("detector", order_by="detector")
        self.check_detector_records_returned(ascending, ordered=True)

        # Descending order by full_name.
        by_name_desc = detectors.order_by(x.detector.full_name.desc)
        self.check_detector_records(by_name_desc, [4, 3, 2, 1], ordered=True)
        by_name_desc_simple = butler.query_dimension_records("detector", order_by="-full_name")
        self.check_detector_records_returned(by_name_desc_simple, ids=[4, 3, 2, 1], ordered=True)

        # Ordering combined with a positive limit.
        first_two = detectors.order_by("detector").limit(2)
        self.check_detector_records(first_two, [1, 2], ordered=True)
        first_two_simple = butler.query_dimension_records("detector", limit=2, order_by="detector")
        self.check_detector_records_returned(first_two_simple, ids=[1, 2], ordered=True)

        # A negative limit is expected to return that many records and log
        # a warning that more were available.
        with self.assertLogs("lsst.daf.butler", level="WARNING") as log_watcher:
            truncated = butler.query_dimension_records("detector", limit=-2, order_by="-detector")
            self.check_detector_records_returned(truncated, ids=[4, 3], ordered=True)
        self.assertIn("More dimension records are available", log_watcher.output[0])

        # Equality constraint, as an expression object and as a string with
        # a bind value.
        in_raft_b = detectors.where(x.detector.raft == "B", instrument="Cam1")
        self.check_detector_records(in_raft_b, [3, 4])
        in_raft_b_simple = butler.query_dimension_records(
            "detector", where="detector.raft = :R", bind={"R": "B"}, instrument="Cam1"
        )
        self.check_detector_records_returned(in_raft_b_simple, ids=[3, 4])

        # Glob patterns on full_name.
        for pattern, matching_ids in (("B?", [3, 4]), ("*a", [1, 3])):
            globbed = detectors.where(x.detector.full_name.glob(pattern), instrument="Cam1")
            self.check_detector_records(globbed, matching_ids)

        # Test incorrect type for glob() parameter.
        with self.assertRaises(InvalidQueryError):
            detectors.where(x.detector.full_name.glob(1), instrument="Cam1")  # type: ignore[arg-type]
def test_simple_data_coordinate_query(self) -> None:
    """Test simple data ID queries, through both `Query.data_ids` and
    `Butler.query_data_ids`, including ordering and limits.
    """
    butler = self.make_butler("base.yaml")

    universe = butler.dimensions
    all_detectors = [
        DataCoordinate.standardize({"instrument": "Cam1", "detector": detector}, universe=universe)
        for detector in [1, 2, 3, 4]
    ]
    # Data IDs for detectors 3 and 4: what a descending query limited to
    # two rows should contain.
    last_two = all_detectors[2:]

    with butler.query() as query:
        # Test empty query
        empty_data_id = DataCoordinate.make_empty(butler.dimensions)
        self.assertCountEqual(list(query.data_ids([])), [empty_data_id])
        self.assertCountEqual(butler.query_data_ids([]), [empty_data_id])

        # Test query for a single dimension
        detector_ids = query.data_ids(["detector"])
        self.assertCountEqual(list(detector_ids), all_detectors)

        # Limit.
        limited = query.data_ids(["detector"]).order_by("-detector").limit(2)
        self.assertCountEqual(list(limited), last_two)

    # Repeat with the convenience method.
    self.assertCountEqual(butler.query_data_ids("detector"), all_detectors)

    limited_simple = butler.query_data_ids("detector", order_by="-detector", limit=2)
    self.assertCountEqual(limited_simple, last_two)

    # A negative limit is expected to also log a warning that more rows
    # were available.
    with self.assertLogs("lsst.daf.butler", level="WARNING") as log_watcher:
        truncated = butler.query_data_ids("detector", order_by="-detector", limit=-2)
        self.assertCountEqual(truncated, last_two)
    self.assertIn("More data IDs are available", log_watcher.output[0])

    nothing = butler.query_data_ids("detector", limit=0)
    self.assertEqual(len(nothing), 0)
def test_simple_dataset_query(self) -> None:
    """Test simple dataset queries, through both `Query.datasets` and
    `Butler.query_datasets`, including limits, wildcard collections, and
    error reporting.
    """
    butler = self.make_butler("base.yaml", "datasets.yaml")
    # IDs of the first two "bias" datasets in "imported_g", by detector.
    first_id = UUID("e15ab039-bc8b-4135-87c5-90902a7c0b22")
    second_id = UUID("51352db4-a47a-447c-b12d-a50b206b17cd")

    with butler.query() as query:
        context_refs = list(query.datasets("bias", "imported_g").order_by("detector"))
    convenience_refs = butler.query_datasets("bias", "imported_g", order_by="detector")
    self.assertCountEqual(context_refs, convenience_refs)

    for refs in (context_refs, convenience_refs):
        self.assertEqual(len(refs), 3)
        self.assertEqual(refs[0].id, first_id)
        self.assertEqual(refs[1].id, second_id)
        for expected_detector, ref in enumerate(refs, start=1):
            self.assertEqual(ref.datasetType.name, "bias")
            self.assertEqual(ref.dataId["instrument"], "Cam1")
            self.assertEqual(ref.dataId["detector"], expected_detector)
            self.assertEqual(ref.run, "imported_g")

    # Try again with limit.
    with butler.query() as query:
        context_refs = list(query.datasets("bias", "imported_g").order_by("detector").limit(2))
    convenience_refs = butler.query_datasets("bias", "imported_g", order_by="detector", limit=2)
    self.assertCountEqual(context_refs, convenience_refs)
    self.assertEqual(len(context_refs), 2)
    self.assertEqual(context_refs[0].id, first_id)
    self.assertEqual(context_refs[1].id, second_id)

    # limit=0 means test the query but don't return anything and
    # don't complain.
    convenience_refs = butler.query_datasets("bias", "imported_g", limit=0, explain=True)
    self.assertEqual(len(convenience_refs), 0)

    # Explicitly run with no restrictions.
    convenience_refs = butler.query_datasets("bias", collections="*", find_first=False, limit=None)
    self.assertEqual(len(convenience_refs), 6)

    # Now limit the number of results and look for a warning.
    with self.assertLogs("lsst.daf.butler", level="WARNING") as log_watcher:
        convenience_refs = butler.query_datasets("bias", collections="*", find_first=False, limit=-4)
        self.assertEqual(len(convenience_refs), 4)
    self.assertIn("More datasets are available", log_watcher.output[0])

    # A collection wildcard is rejected for a find-first search ...
    with self.assertRaises(InvalidQueryError) as wildcard_error:
        butler.query_datasets("bias", "*", detector=100, instrument="Unknown", find_first=True)
    self.assertIn("Can not use wildcards", str(wildcard_error.exception))
    # ... and without find-first the query runs but reports why it is empty.
    with self.assertRaises(EmptyQueryResultError) as empty_error:
        butler.query_datasets("bias", "*", detector=100, instrument="Unknown", find_first=False)
    self.assertIn("doomed", str(empty_error.exception))

    # Test for a regression of an issue where "band" was not being included
    # in the data ID, despite being one of the dimensions in the "flat"
    # dataset type.
    #
    # "band" is implied by "physical_filter", so it's technically not a
    # 'required' dimension. However, the contract of query_datasets is
    # that hasFull() should be true, so implied dimensions must be
    # included.
    flats = butler.query_datasets("flat", "imported_r", where="detector = 2", instrument="Cam1")
    self.assertEqual(len(flats), 1)
    flat_ref = flats[0]
    self.assertTrue(flat_ref.dataId.hasFull())
    self.assertEqual(flat_ref.datasetType.name, "flat")
    self.assertEqual(flat_ref.dataId["instrument"], "Cam1")
    self.assertEqual(flat_ref.dataId["detector"], 2)
    self.assertEqual(flat_ref.dataId["physical_filter"], "Cam1-R1")
    self.assertEqual(flat_ref.dataId["band"], "r")
def test_general_query(self) -> None:
    """Test Query.general and its result.

    Covers plain dimension queries (extra dimension fields, ``order_by``,
    and ``where``), dataset searches in RUN, TAGGED, and CALIBRATION
    collections, ``iter_tuples``, and ``with_dimension_records``.
    """
    butler = self.make_butler("base.yaml", "datasets.yaml")
    dimensions = butler.dimensions["detector"].minimal_group

    # Do simple dimension queries.
    with butler.query() as query:
        query = query.join_dimensions(dimensions)
        rows = list(query.general(dimensions).order_by("detector"))
        self.assertEqual(
            rows,
            [
                {"instrument": "Cam1", "detector": 1},
                {"instrument": "Cam1", "detector": 2},
                {"instrument": "Cam1", "detector": 3},
                {"instrument": "Cam1", "detector": 4},
            ],
        )
        # Extra fields may be requested with or without the "detector."
        # prefix; the expected rows below use the fully-qualified key in
        # both cases.
        rows = list(
            query.general(dimensions, "detector.full_name", "purpose").order_by(
                "-detector.purpose", "full_name"
            )
        )
        self.assertEqual(
            rows,
            [
                {
                    "instrument": "Cam1",
                    "detector": 4,
                    "detector.full_name": "Bb",
                    "detector.purpose": "WAVEFRONT",
                },
                {
                    "instrument": "Cam1",
                    "detector": 1,
                    "detector.full_name": "Aa",
                    "detector.purpose": "SCIENCE",
                },
                {
                    "instrument": "Cam1",
                    "detector": 2,
                    "detector.full_name": "Ab",
                    "detector.purpose": "SCIENCE",
                },
                {
                    "instrument": "Cam1",
                    "detector": 3,
                    "detector.full_name": "Ba",
                    "detector.purpose": "SCIENCE",
                },
            ],
        )
        # Same fields, now filtered with a string ``where`` expression.
        rows = list(
            query.general(dimensions, "detector.full_name", "purpose").where(
                "instrument = 'Cam1' AND purpose = 'WAVEFRONT'"
            )
        )
        self.assertEqual(
            rows,
            [
                {
                    "instrument": "Cam1",
                    "detector": 4,
                    "detector.full_name": "Bb",
                    "detector.purpose": "WAVEFRONT",
                },
            ],
        )
        # Fields can also be requested through the ``dimension_fields``
        # keyword argument.
        result = query.general(dimensions, dimension_fields={"detector": {"full_name"}})
        self.assertEqual(set(row["detector.full_name"] for row in result), {"Aa", "Ab", "Ba", "Bb"})

    # Use "flat" whose dimension group includes implied dimension.
    flat = butler.get_dataset_type("flat")
    dimensions = butler.dimensions.conform(["detector", "physical_filter"])

    # Do simple dataset queries in RUN collection.
    with butler.query() as query:
        query = query.join_dataset_search("flat", "imported_g")
        # This just returns data IDs.
        rows = list(query.general(dimensions).order_by("detector"))
        self.assertEqual(
            rows,
            [
                {"instrument": "Cam1", "detector": 2, "physical_filter": "Cam1-G", "band": "g"},
                {"instrument": "Cam1", "detector": 3, "physical_filter": "Cam1-G", "band": "g"},
                {"instrument": "Cam1", "detector": 4, "physical_filter": "Cam1-G", "band": "g"},
            ],
        )

        # Request all dataset fields for "flat"; the dataset ID is then
        # available under the "flat.dataset_id" key.
        result = query.general(dimensions, dataset_fields={"flat": ...}, find_first=True).order_by(
            "detector"
        )
        ids = {row["flat.dataset_id"] for row in result}
        self.assertEqual(
            ids,
            {
                UUID("60c8a65c-7290-4c38-b1de-e3b1cdcf872d"),
                UUID("84239e7f-c41f-46d5-97b9-a27976b98ceb"),
                UUID("fd51bce1-2848-49d6-a378-f8a122f5139a"),
            },
        )

        # Check what iter_tuples() returns
        row_tuples = list(result.iter_tuples(flat))
        self.assertEqual(len(row_tuples), 3)
        for row_tuple in row_tuples:
            self.assertEqual(len(row_tuple.refs), 1)
            self.assertEqual(row_tuple.refs[0].datasetType, flat)
            self.assertTrue(row_tuple.refs[0].dataId.hasFull())
            self.assertFalse(row_tuple.refs[0].dataId.hasRecords())
            self.assertTrue(row_tuple.data_id.hasFull())
            self.assertFalse(row_tuple.data_id.hasRecords())
            self.assertEqual(row_tuple.data_id.dimensions, dimensions)
            self.assertEqual(row_tuple.raw_row["flat.run"], "imported_g")

        # Keep the three refs (ordered by detector) for the collection
        # tests below.
        flat1, flat2, flat3 = (row_tuple.refs[0] for row_tuple in row_tuples)

    # Query datasets CALIBRATION/TAGGED collections.
    butler.registry.registerCollection("tagged", CollectionType.TAGGED)
    butler.registry.registerCollection("calib", CollectionType.CALIBRATION)

    # Add two refs to tagged collection.
    butler.registry.associate("tagged", [flat1, flat2])

    # Certify some calibs.  flat1 is certified twice (two different
    # timespans) and flat2 with an empty timespan, so the calib collection
    # holds four certifications of three distinct datasets.
    t1 = astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai")
    t2 = astropy.time.Time("2020-01-01T02:00:00", format="isot", scale="tai")
    t3 = astropy.time.Time("2020-01-01T03:00:00", format="isot", scale="tai")
    butler.registry.certify("calib", [flat1], Timespan(t1, t2))
    butler.registry.certify("calib", [flat3], Timespan(t2, t3))
    butler.registry.certify("calib", [flat1], Timespan(t3, None))
    butler.registry.certify("calib", [flat2], Timespan.makeEmpty())

    # Query tagged collection.
    with butler.query() as query:
        query = query.join_dataset_search("flat", ["tagged"])

        result = query.general(
            dimensions, "flat.dataset_id", "flat.run", "flat.collection", find_first=False
        )
        row_tuples = list(result.iter_tuples(flat))
        self.assertEqual(len(row_tuples), 2)
        self.assertEqual({row_tuple.refs[0] for row_tuple in row_tuples}, {flat1, flat2})
        self.assertEqual({row_tuple.raw_row["flat.collection"] for row_tuple in row_tuples}, {"tagged"})

    # Query calib collection.
    with butler.query() as query:
        query = query.join_dataset_search("flat", ["calib"])
        result = query.general(
            dimensions,
            "flat.dataset_id",
            "flat.run",
            "flat.collection",
            "flat.timespan",
            find_first=False,
        )
        row_tuples = list(result.iter_tuples(flat))
        # One row per certification, not per dataset.
        self.assertEqual(len(row_tuples), 4)
        self.assertEqual({row_tuple.refs[0] for row_tuple in row_tuples}, {flat1, flat2, flat3})
        self.assertEqual({row_tuple.raw_row["flat.collection"] for row_tuple in row_tuples}, {"calib"})
        self.assertEqual(
            {row_tuple.raw_row["flat.timespan"] for row_tuple in row_tuples},
            {Timespan(t1, t2), Timespan(t2, t3), Timespan(t3, None), Timespan.makeEmpty()},
        )

    # Query both tagged and calib collection.
    with butler.query() as query:
        query = query.join_dataset_search("flat", ["tagged", "calib"])
        result = query.general(
            dimensions,
            "flat.dataset_id",
            "flat.run",
            "flat.collection",
            "flat.timespan",
            find_first=False,
        )
        row_tuples = list(result.iter_tuples(flat))
        # 2 rows from "tagged" plus 4 rows from "calib".
        self.assertEqual(len(row_tuples), 6)
        self.assertEqual(
            {row_tuple.raw_row["flat.collection"] for row_tuple in row_tuples}, {"calib", "tagged"}
        )
        # NOTE(review): Timespan(None, None) is presumably what rows from
        # the tagged collection report for "flat.timespan" -- confirm.
        self.assertEqual(
            {row_tuple.raw_row["flat.timespan"] for row_tuple in row_tuples},
            {
                Timespan(t1, t2),
                Timespan(t2, t3),
                Timespan(t3, None),
                Timespan.makeEmpty(),
                Timespan(None, None),
            },
        )

    dimensions = butler.dimensions["detector"].minimal_group

    # Include dimension records into query.
    with butler.query() as query:
        query = query.join_dimensions(dimensions)
        result = query.general(dimensions).order_by("detector")
        rows = list(result.with_dimension_records())
        # Only the first row (detector 1) is checked in full.
        self.assertEqual(
            rows[0],
            {
                "instrument": "Cam1",
                "detector": 1,
                "instrument.visit_max": 1024,
                "instrument.visit_system": 1,
                "instrument.exposure_max": 512,
                "instrument.detector_max": 4,
                "instrument.class_name": "lsst.pipe.base.Instrument",
                "detector.full_name": "Aa",
                "detector.name_in_raft": "a",
                "detector.raft": "A",
                "detector.purpose": "SCIENCE",
            },
        )

    dimensions = butler.dimensions.conform(["detector", "physical_filter"])

    # DataIds should come with records.
    with butler.query() as query:
        query = query.join_dataset_search("flat", "imported_g")
        result = query.general(dimensions, dataset_fields={"flat": ...}, find_first=True).order_by(
            "detector"
        )
        result = result.with_dimension_records()
        row_tuples = list(result.iter_tuples(flat))
        self.assertEqual(len(row_tuples), 3)
        for row_tuple in row_tuples:
            self.assertTrue(row_tuple.data_id.hasRecords())
            self.assertEqual(len(row_tuple.refs), 1)
            self.assertTrue(row_tuple.refs[0].dataId.hasRecords())
def test_query_ingest_date(self) -> None:
    """Test general queries that return or filter on the dataset
    ``ingest_date`` field.
    """
    # Taken before the test data is loaded, so every dataset is expected
    # to have an ingest date at or after this time.
    before_ingest = astropy.time.Time.now()
    butler = self.make_butler("base.yaml", "datasets.yaml")
    dimensions = butler.dimensions.conform(["detector", "physical_filter"])

    # Check that returned type of ingest_date is astropy Time, must work
    # for schema versions 1 and 2 of datasets manager.
    with butler.query() as query:
        flat_query = query.join_dataset_search("flat", "imported_g")
        results = flat_query.general(dimensions, dataset_fields={"flat": {"ingest_date"}}, find_first=False)
        rows = list(results)
        self.assertEqual(len(rows), 3)
        for row in rows:
            self.assertIsInstance(row["flat.ingest_date"], astropy.time.Time)

    # Check that WHERE accepts astropy time
    with butler.query() as query:
        flat_query = query.join_dataset_search("flat", "imported_g")

        def count_rows(where: str, **kwargs: object) -> int:
            # Number of data ID rows left after applying the constraint.
            return len(list(flat_query.where(where, **kwargs).general(dimensions)))

        self.assertEqual(
            count_rows("flat.ingest_date < :before_ingest", bind={"before_ingest": before_ingest}), 0
        )
        self.assertEqual(
            count_rows("flat.ingest_date >= :before_ingest", bind={"before_ingest": before_ingest}), 3
        )
        # Same with a time in string literal.
        self.assertEqual(count_rows(f"flat.ingest_date < T'mjd/{before_ingest.tai.mjd}'"), 0)
def test_implied_union_record_query(self) -> None:
    """Test queries for a dimension ('band') that uses "implied union"
    storage, in which its values are the union of the values for it in
    another dimension (physical_filter) that implies it.
    """
    butler = self.make_butler("base.yaml")
    band = butler.dimensions["band"]
    self.assertEqual(band.implied_union_target, butler.dimensions["physical_filter"])

    make_band = band.RecordClass
    all_bands = [make_band(name="g"), make_band(name="r")]
    r_band_only = [make_band(name="r")]

    with butler.query() as query:
        # Unconstrained: every band of any physical filter.
        self.assertCountEqual(list(query.dimension_records("band")), all_bands)
        # Constrained to one physical filter: only its band.
        r1_query = query.where(physical_filter="Cam1-R1", instrument="Cam1")
        self.assertCountEqual(list(r1_query.dimension_records("band")), r_band_only)

    # The same two checks through the convenience method.
    self.assertCountEqual(butler.query_dimension_records("band"), all_bands)
    self.assertCountEqual(
        butler.query_dimension_records("band", physical_filter="Cam1-R1", instrument="Cam1"),
        r_band_only,
    )
def test_dataset_constrained_record_query(self) -> None:
    """Test dimension-record queries that are constrained by the existence
    of datasets of a particular type.
    """
    butler = self.make_butler("base.yaml", "datasets.yaml")
    # Extra instrument with no datasets, plus an empty RUN and a chain
    # that searches imported_g, empty, imported_r in that order.
    butler.registry.insertDimensionData("instrument", {"name": "Cam2"})
    butler.collections.register("empty", CollectionType.RUN)
    butler.collections.register("chain", CollectionType.CHAINED)
    butler.collections.redefine_chain("chain", ["imported_g", "empty", "imported_r"])

    with butler.query() as query:
        # No collections here or in defaults is an error.
        with self.assertRaises(NoDefaultCollectionError):
            query.join_dataset_search("bias").dimension_records("detector").any()

    butler.registry.defaults = RegistryDefaults(collections=["chain"])
    with butler.query() as query:
        x = query.expression_factory

        # Simplest case: this collection only has the first 3 detectors.
        bias_in_g = query.join_dataset_search("bias", collections=["imported_g"])
        self.check_detector_records(bias_in_g.dimension_records("detector"), [1, 2, 3])

        # Together these collections have two biases for two of the
        # detectors, but this shouldn't cause duplicate results.
        bias_in_g_and_r = query.join_dataset_search("bias", collections=["imported_g", "imported_r"])
        self.check_detector_records(bias_in_g_and_r.dimension_records("detector"))

        # Again we've got the potential for duplicates due to multiple
        # datasets with the same data ID, and this time we force the
        # deduplication to happen outside the dataset-search subquery by
        # adding a WHERE filter on a dataset column.  We also use the
        # defaulted collection ('chain') to supply the collection.
        bias_in_defaults = query.join_dataset_search("bias")
        g_with_raft_b = x.all(x["bias"].collection == "imported_g", x.detector.raft == "B")
        r_with_raft_a = x.all(x["bias"].collection == "imported_r", x.detector.raft == "A")
        filtered = bias_in_defaults.where(x.any(g_with_raft_b, r_with_raft_a), instrument="Cam1")
        self.check_detector_records(filtered.dimension_records("detector"), [2, 3])

        # Flats have dimensions (physical_filter and band) we would
        # normally include in query for detector records.  This also should
        # not cause duplicates.
        flat_in_g = query.join_dataset_search("flat", collections=["imported_g"])
        self.check_detector_records(flat_in_g.dimension_records("detector"), [2, 3, 4])

        # No results, but for reasons we can't determine before we run the
        # query.
        r_flats_in_g = query.join_dataset_search("flat", collections=["imported_g"]).where(x.band == "r")
        self.check_detector_records(r_flats_in_g.dimension_records("detector"), [])

        # No results, and we can diagnose why before we run the query.
        bias_in_empty = query.join_dataset_search("bias", collections=["empty"])
        self.check_detector_records(
            bias_in_empty.dimension_records("detector"),
            [],
            messages=[
                "Search for dataset type 'bias' in ['empty'] is doomed to fail.",
                "No datasets of type 'bias' in collection 'empty'.",
            ],
            doomed=True,
        )
        cam2_bias_in_g = query.join_dataset_search("bias", collections=["imported_g"]).where(
            instrument="Cam2"
        )
        self.check_detector_records(
            cam2_bias_in_g.dimension_records("detector"),
            [],
            messages=[
                "Search for dataset type 'bias' in ['imported_g'] is doomed to fail.",
                "No datasets with instrument='Cam2' in collection 'imported_g'.",
            ],
            doomed=True,
        )
def test_duplicate_overlaps(self) -> None:
    """Test for query option that enables duplicate rows in queries that
    use skypix overlaps.
    """
    butler = self.make_butler("base.yaml", "spatial.yaml")
    butler.registry.defaults = RegistryDefaults(instrument="Cam1", skymap="SkyMap1")
    with butler.query() as query:

        def tract_patch_pairs() -> list[tuple[int, int]]:
            # (tract, patch) of every row for visit 1, detector 1.
            matches = list(query.data_ids(["visit", "detector", "patch"]).where(visit=1, detector=1))
            return [(match["tract"], match["patch"]) for match in matches]

        # By default each overlapping patch is expected exactly once.
        self.assertCountEqual(tract_patch_pairs(), [(0, 0), (0, 2)])

        # With the private flag set, the same query is expected to return
        # each patch twice.
        query._allow_duplicate_overlaps = True
        self.assertCountEqual(tract_patch_pairs(), [(0, 0), (0, 0), (0, 2), (0, 2)])
725 def test_spatial_overlaps(self) -> None:
726 """Test queries for dimension records with spatial overlaps.
728 Run tests/data/registry/spatial.py to plot the various regions used in
729 this test.
730 """
731 butler = self.make_butler("base.yaml", "spatial.yaml")
732 # Set default governor data ID values both to test that code path and
733 # to keep us from having to repeat them in every 'where' call below.
734 butler.registry.defaults = RegistryDefaults(instrument="Cam1", skymap="SkyMap1")
735 htm7 = butler.dimensions.skypix_dimensions["htm7"]
736 with butler.query() as query:
737 _x = query.expression_factory
738 # Query for detectors from a particular visit that overlap an
739 # explicit region.
740 self.check_detector_records(
741 query.where(
742 _x.visit_detector_region.region.overlaps(htm7.pixelization.pixel(253954)),
743 visit=1,
744 ).dimension_records("detector"),
745 [1, 3, 4],
746 has_postprocessing=True,
747 )
748 self.check_detector_records_returned(
749 butler.query_dimension_records(
750 "detector",
751 where="visit_detector_region.region OVERLAPS :region",
752 bind={"region": htm7.pixelization.pixel(253954)},
753 visit=1,
754 ),
755 ids=[1, 3, 4],
756 )
757 # Query for detectors from a particular visit that overlap an htm7
758 # ID. This is basically the same query as the last one, but
759 # expressed as a spatial join, and we can recognize that
760 # postprocessing is not needed (while in the last case it did
761 # nothing, but we couldn't tell that in advance because the query
762 # didn't know the region came from htm7).
763 self.check_detector_records(
764 query.where(
765 _x.visit_detector_region.region.overlaps(_x.htm7.region),
766 visit=1,
767 htm7=253954,
768 ).dimension_records("detector"),
769 [1, 3, 4],
770 has_postprocessing=False,
771 )
772 # Repeat the last query but with the spatial join implicit rather
773 # than explicit.
774 self.check_detector_records(
775 query.where(
776 visit=1,
777 htm7=253954,
778 ).dimension_records("detector"),
779 [1, 3, 4],
780 has_postprocessing=False,
781 )
782 self.check_detector_records_returned(
783 butler.query_dimension_records(
784 "detector",
785 visit=1,
786 htm7=253954,
787 ),
788 ids=[1, 3, 4],
789 )
790 # Query for the detectors from any visit that overlap a region:
791 # this gets contributions from multiple visits, and would have
792 # duplicates if we didn't get rid of them via GROUP BY.
793 self.check_detector_records(
794 query.where(
795 _x.visit_detector_region.region.overlaps(htm7.pixelization.pixel(253954)),
796 ).dimension_records("detector"),
797 [1, 2, 3, 4],
798 has_postprocessing=True,
799 )
800 self.check_detector_records_returned(
801 butler.query_dimension_records(
802 "detector",
803 where="visit_detector_region.region OVERLAPS :region",
804 bind={"region": htm7.pixelization.pixel(253954)},
805 ),
806 ids=[1, 2, 3, 4],
807 )
808 # Once again we rewrite the region-constraint query as a spatial
809 # join, which drops the postprocessing. This join has to be
810 # explicit because `visit` no longer gets into the query dimensions
811 # some other way, and without it `detector` is not spatial.
812 self.check_detector_records(
813 query.where(
814 _x.visit_detector_region.region.overlaps(_x.htm7.region),
815 htm7=253954,
816 ).dimension_records("detector"),
817 [1, 2, 3, 4],
818 has_postprocessing=False,
819 )
820 # Query for detectors from any visit that overlap a patch. This
821 # requires joining visit_detector_region to htm7 and htm7 to patch,
822 # and then some postprocessing. We want to make sure there are no
823 # duplicates from a detector and patch both overlapping multiple
824 # htm7 tiles (which affects detectors 1 and 2) and that
825 # postprocessing filters out detector 4, which has one htm7 tile in
826 # common with the patch but does not actually overlap it.
827 self.check_detector_records(
828 query.where(
829 _x.visit_detector_region.region.overlaps(_x.patch.region),
830 tract=0,
831 patch=4,
832 ).dimension_records("detector"),
833 [1, 2, 3],
834 has_postprocessing=True,
835 )
836 # Same as above, but with a materialization.
837 self.check_detector_records(
838 query.where(
839 _x.visit_detector_region.region.overlaps(_x.patch.region),
840 tract=0,
841 patch=4,
842 )
843 .materialize(dimensions=["detector"])
844 .dimension_records("detector"),
845 [1, 2, 3],
846 has_postprocessing=True,
847 )
848 # Query for that patch's region and express the previous query as
849 # a region-constraint instead of a spatial join.
850 (patch_record,) = query.where(tract=0, patch=4).dimension_records("patch")
851 self.check_detector_records(
852 query.where(
853 _x.visit_detector_region.region.overlaps(patch_record.region),
854 ).dimension_records("detector"),
855 [1, 2, 3],
856 has_postprocessing=True,
857 )
858 self.check_detector_records_returned(
859 butler.query_dimension_records(
860 "detector",
861 where="visit_detector_region.region OVERLAPS :region",
862 bind={"region": patch_record.region},
863 ),
864 ids=[1, 2, 3],
865 )
866 # Query for detectors where a patch/visit+detector overlap is
867 # satisfied, in the case where there are no rows with an overlap,
868 # but the union of the patch regions overlaps the union of the
869 # visit+detector regions.
870 self.check_detector_records(
871 query.where(
872 _x.visit_detector_region.region.overlaps(_x.patch.region),
873 _x.any(
874 _x.all(_x.tract == 1, _x.visit == 1),
875 _x.all(_x.tract == 0, _x.patch == 0, _x.visit == 2),
876 ),
877 ).dimension_records("detector"),
878 [],
879 has_postprocessing=True,
880 )
881 # Combine postprocessing with order_by and limit.
882 self.check_detector_records(
883 query.where(
884 _x.visit_detector_region.region.overlaps(patch_record.region),
885 )
886 .dimension_records("detector")
887 .order_by(_x.detector.desc)
888 .limit(2),
889 [3, 2],
890 has_postprocessing=True,
891 )
892 self.check_detector_records_returned(
893 butler.query_dimension_records(
894 "detector",
895 where="visit_detector_region.region OVERLAPS :region",
896 bind={"region": patch_record.region},
897 order_by="-detector",
898 limit=2,
899 ),
900 ids=[3, 2],
901 )
902 # Try a case where there are some records before postprocessing but
903 # none afterwards.
904 self.check_detector_records(
905 query.where(
906 _x.visit_detector_region.region.overlaps(patch_record.region),
907 detector=4,
908 ).dimension_records("detector"),
909 [],
910 has_postprocessing=True,
911 )
912 self.check_detector_records_returned(
913 butler.query_dimension_records(
914 "detector",
915 where="visit_detector_region.region OVERLAPS :region",
916 bind={"region": patch_record.region},
917 detector=4,
918 explain=False,
919 ),
920 ids=[],
921 )
922 # Check spatial queries using points instead of regions.
923 # This (ra, dec) is a point in the center of the region for visit
924 # 1, detector 3.
925 ra = 0.25209391431545386 # degrees
926 dec = 0.9269112711026793 # degrees
928 def _check_visit_id(query: Query) -> None:
929 result = list(query.data_ids(["visit", "detector"]))
930 self.assertEqual(len(result), 1)
931 id = result[0]
932 self.assertEqual(id["visit"], 1)
933 self.assertEqual(id["detector"], 3)
935 # Basic POINT() syntax.
936 _check_visit_id(query.where(f"visit_detector_region.region OVERLAPS POINT({ra}, {dec})"))
937 _check_visit_id(query.where(f"POINT({ra}, {dec}) OVERLAPS visit_detector_region.region"))
939 # dec of 1 is close enough to still be in the region, and tests
940 # conversion of integer to float.
941 _check_visit_id(query.where(f"visit_detector_region.region OVERLAPS POINT({ra}, 1)"))
943 # Negative values are allowed for dec, since it's defined as -90 to
944 # 90. Tract 1, patch 4 slightly overlaps some negative dec values.
945 result = list(query.where("patch.region OVERLAPS POINT(0.335, -0.000000001)").data_ids(["patch"]))
946 self.assertEqual(len(result), 1)
947 id = result[0]
948 self.assertEqual(id["patch"], 4)
949 self.assertEqual(id["tract"], 1)
950 # Out of bounds dec values are not allowed.
951 with self.assertRaisesRegex(ValueError, "invalid latitude angle"):
952 list(query.where("patch.region OVERLAPS POINT(0.335, -91)").data_ids(["patch"]))
954 # Negative ra values are allowed.
955 _check_visit_id(query.where(f"POINT({ra - 360}, {dec}) OVERLAPS visit_detector_region.region"))
957 # Substitute ra and dec values via bind instead of literals in the
958 # string.
959 _check_visit_id(
960 query.where(
961 "visit_detector_region.region OVERLAPS POINT(:ra, :dec)", bind={"ra": ra, "dec": dec}
962 )
963 )
965 # Bind in a point object instead of specifying ra/dec separately.
966 _check_visit_id(
967 query.where(
968 "visit_detector_region.region OVERLAPS :my_point",
969 bind={"my_point": LonLat.fromDegrees(ra, dec)},
970 )
971 )
972 _check_visit_id(
973 query.where(
974 "visit_detector_region.region OVERLAPS :my_point",
975 bind={"my_point": astropy.coordinates.SkyCoord(ra, dec, frame="icrs", unit="deg")},
976 )
977 )
978 # Make sure alternative coordinate frames in astropy SkyCoord are
979 # handled.
980 _check_visit_id(
981 query.where(
982 "visit_detector_region.region OVERLAPS :my_point",
983 bind={
984 "my_point": astropy.coordinates.SkyCoord(
985 ra, dec, frame="icrs", unit="deg"
986 ).transform_to("galactic")
987 },
988 )
989 )
991 # Compare against literal values using ExpressionFactory.
992 _check_visit_id(
993 query.where(_x.visit_detector_region.region.overlaps(LonLat.fromDegrees(ra, dec)))
994 )
995 _check_visit_id(
996 query.where(
997 _x.visit_detector_region.region.overlaps(
998 astropy.coordinates.SkyCoord(ra, dec, frame="icrs", unit="deg")
999 )
1000 )
1001 )
1003 # Check errors for invalid syntax.
1004 with self.assertRaisesRegex(
1005 InvalidQueryError, r"Expression 'visit.id' in POINT\(\) is not a literal number."
1006 ):
1007 query.where(f"visit_detector_region.region OVERLAPS POINT(visit.id, {dec})")
1008 with self.assertRaisesRegex(
1009 InvalidQueryError, r"Expression ''not-a-number'' in POINT\(\) is not a literal number."
1010 ):
1011 query.where(f"visit_detector_region.region OVERLAPS POINT({ra}, 'not-a-number')")
1013 # astropy's SkyCoord can be array-valued, but we expect only a
1014 # single point.
1015 array_point = astropy.coordinates.SkyCoord(
1016 ra=[10, 11, 12, 13], dec=[41, -5, 42, 0], unit="deg", frame="icrs"
1017 )
1018 with self.assertRaisesRegex(ValueError, "Astropy SkyCoord contained an array of points"):
1019 query.where(
1020 "visit_detector_region.region OVERLAPS :my_point",
1021 bind={"my_point": array_point},
1022 )
def test_auto_spatial_joins(self) -> None:
    """Check when automatic spatial joins are added to a query.

    A dataset type whose dimensions cross spatial families is joined in,
    first with only its own dimensions requested and then with ``patch``
    requested as well.
    """
    butler = self.make_butler("base.yaml", "spatial.yaml")
    # Defaulting the governor dimensions exercises that code path and
    # spares every query below from repeating them.
    butler.registry.defaults = RegistryDefaults(instrument="Cam1", skymap="SkyMap1")
    # Register a {tract, visit, detector} dataset type and insert one
    # dataset for each {visit, detector} that overlaps tract=0.
    cat_dimensions = butler.dimensions.conform(["visit", "detector", "tract"])
    cat = DatasetType("cat", dimensions=cat_dimensions, storageClass="ArrowTable")
    butler.registry.registerDatasetType(cat)
    butler.collections.register("run1")
    visit_detector_pairs = [(1, 1), (1, 2), (1, 3), (1, 4), (2, 1), (2, 2)]
    butler.registry.insertDatasets(
        cat,
        [
            {"instrument": "Cam1", "skymap": "SkyMap1", "visit": visit, "detector": detector, "tract": 0}
            for visit, detector in visit_detector_pairs
        ],
        run="run1",
    )
    with butler.query() as query:
        # Asking only for the dataset type's own dimensions should not
        # need a spatial join: the relationship is assumed to be embedded
        # in the rows of the dataset search.
        coarse = query.join_dataset_search(cat, "run1").data_ids(["visit", "detector", "tract"])
        # No explicit spatial join means no postprocessing, so an exact
        # count with discard=False is possible.
        self.assertEqual(coarse.count(exact=True, discard=False), 6)
        self.assertCountEqual(
            [(row["visit"], row["detector"], row["tract"]) for row in coarse],
            [(visit, detector, 0) for visit, detector in visit_detector_pairs],
        )
        # Adding a dimension that wants a more precise spatial join
        # (patch) should produce that more precise join.
        fine = query.join_dataset_search(cat, "run1").data_ids(["visit", "detector", "tract", "patch"])
        expected_fine = [
            (1, 1, 0, 0),
            (1, 1, 0, 2),
            (1, 2, 0, 0),
            (1, 2, 0, 1),
            (1, 2, 0, 2),
            (1, 2, 0, 3),
            (1, 3, 0, 2),
            (1, 3, 0, 4),
            (1, 4, 0, 2),
            (1, 4, 0, 3),
            (2, 1, 0, 4),
            (2, 2, 0, 4),
            (2, 2, 0, 5),
        ]
        self.assertCountEqual(
            [(row["visit"], row["detector"], row["tract"], row["patch"]) for row in fine],
            expected_fine,
        )
def test_common_skypix_overlaps(self) -> None:
    """Check spatial overlap queries whose results are htm7 records."""
    butler = self.make_butler("base.yaml", "spatial.yaml")
    # Some queries are only possible when a superset of the skypix IDs is
    # already in the query, so insert datasets that use a skypix dimension.
    cat1 = DatasetType("cat1", dimensions=butler.dimensions.conform(["htm7"]), storageClass="ArrowTable")
    butler.registry.registerDatasetType(cat1)
    butler.registry.registerCollection("refcats", CollectionType.RUN)
    htm7_data_ids = [{"htm7": pixel} for pixel in range(253952, 253968)]
    butler.registry.insertDatasets(cat1, htm7_data_ids, run="refcats")
    # Every query below is expected to select the same two htm7 pixels.
    expected_pixels = [253954, 253955]
    with butler.query() as query:
        _x = query.expression_factory
        # Explicit join to patch.
        explicit_join = query.where(
            _x.htm7.region.overlaps(_x.patch.region), skymap="SkyMap1", tract=0, patch=4
        )
        self.assertCountEqual(
            [htm7_record.id for htm7_record in explicit_join.dimension_records("htm7")],
            expected_pixels,
        )
        # Implicit join to patch.
        implicit_join = query.where(skymap="SkyMap1", tract=0, patch=4)
        self.assertCountEqual(
            [htm7_record.id for htm7_record in implicit_join.dimension_records("htm7")],
            expected_pixels,
        )
        simple_records = butler.query_dimension_records("htm7", skymap="SkyMap1", tract=0, patch=4)
        self.assertCountEqual(
            [htm7_record.id for htm7_record in simple_records],
            expected_pixels,
        )
        # Constrain on the patch's region without telling the query which
        # patch that region belongs to.
        (patch,) = query.where(skymap="SkyMap1", tract=0, patch=4).dimension_records("patch")
        region_constrained = query.join_dataset_search("cat1", collections=["refcats"]).where(
            _x.htm7.region.overlaps(patch.region)
        )
        self.assertCountEqual(
            [htm7_record.id for htm7_record in region_constrained.dimension_records("htm7")],
            expected_pixels,
        )
def test_spatial_constraint_queries(self) -> None:
    """Check queries where one spatial dimension constrains another.

    The constraint (a data ID or ``where`` argument) names one spatial
    dimension while the result columns hold a different one.
    """
    butler = self.make_butler("base.yaml", "spatial.yaml")
    with butler.query() as query:
        # Here the 'tract' region is needed for postprocessing (it is
        # compared against the visit region) but is not needed in the
        # data IDs that come back.
        tracts_via_query = [
            data_id["tract"]
            for data_id in query.data_ids(["tract"]).where({"instrument": "Cam1", "visit": 1})
        ]
        self.assertCountEqual([0], tracts_via_query)
        tracts_via_butler = [
            data_id["tract"] for data_id in butler.query_data_ids(["tract"], instrument="Cam1", visit=1)
        ]
        self.assertCountEqual([0], tracts_via_butler)

        # Here the 'tract' region is needed in postprocessing AND is also
        # returned in the result rows.
        region_hex = (
            "7022408b0df0feef3f20378b0df0fe6f3fe23d8b0df0fe8f3ff1d8af0460ffef3f"
            "efcfaf0460ff6f3f75e0830388ff873f31aaeb0730ffef3fb0a5eb0730ff7f3f65"
            "bdf00564ff873f31aaeb0730ffef3fb1aeeb0730ff7f3f65bdf00564ff873f3e1c"
            "2f0fe0feef3f6e57630b28ff873fef52630b28ff873f911ade5e30fdef3f2d9626"
            "47e4fd873f0d952647e4fd973f553df64a80fdef3fd438f64a80fd7f3f20af3838"
            "20fe973f58462440b0fdef3f573d2440b0fd6f3fe2351b3044fe973f22408b0df0"
            "feef3f20378b0df0fe6f3f61428b0df0fe8f3f"
        )
        expected_rows = [(0, region_hex)]
        rows_via_query = [
            (tract.id, tract.region.encode().hex())
            for tract in query.dimension_records("tract").where({"instrument": "Cam1", "visit": 1})
        ]
        self.assertEqual(expected_rows, rows_via_query)
        rows_via_butler = [
            (tract.id, tract.region.encode().hex())
            for tract in butler.query_dimension_records("tract", instrument="Cam1", visit=1)
        ]
        self.assertEqual(expected_rows, rows_via_butler)
def test_data_coordinate_upload(self) -> None:
    """Check dimension-record queries joined to uploaded data IDs."""
    butler = self.make_butler("base.yaml", "spatial.yaml")
    universe = butler.dimensions
    make = DataCoordinate.standardize
    with butler.query() as query:
        # Upload with an irrelevant row (there's no data with "Cam2").
        with_irrelevant_row = [
            make(instrument="Cam1", detector=1, universe=universe),
            make(instrument="Cam1", detector=3, universe=universe),
            make(instrument="Cam2", detector=4, universe=universe),
        ]
        self.check_detector_records(
            query.join_data_coordinates(with_irrelevant_row).dimension_records("detector"),
            [1, 3],
        )
        # Upload that directly contains a duplicate, which should not
        # appear in the results.
        with_duplicate = [
            make(instrument="Cam1", detector=1, universe=universe),
            make(instrument="Cam1", detector=3, universe=universe),
            make(instrument="Cam1", detector=3, universe=universe),
        ]
        self.check_detector_records(
            query.join_data_coordinates(with_duplicate).dimension_records("detector"),
            [1, 3],
        )
        # Upload with an extra dimension (visit) that could also introduce
        # duplicates if we're not careful.
        visit_detector_ids = [
            make(instrument="Cam1", visit=visit, detector=detector, universe=universe)
            for visit, detector in [(1, 1), (2, 3), (1, 3)]
        ]
        self.check_detector_records(
            query.join_data_coordinates(visit_detector_ids).dimension_records("detector"),
            [1, 3],
        )
        # Same upload, with the extra dimension now used in a constraint.
        constrained_by_visit = query.join_data_coordinates(visit_detector_ids).where(
            instrument="Cam1", visit=2
        )
        self.check_detector_records(constrained_by_visit.dimension_records("detector"), [3])
        # Upload that must be spatially joined to the other dimensions.
        # This join is added automatically.
        patch_ids = [
            make(skymap="SkyMap1", tract=1, patch=patch, universe=universe) for patch in (1, 2, 3)
        ]
        auto_joined = query.join_data_coordinates(patch_ids).where(instrument="Cam1", visit=2)
        self.check_detector_records(
            auto_joined.dimension_records("detector"),
            [2, 3, 4],
            has_postprocessing=True,
        )
        # Upload that embeds a spatial relationship, which prevents
        # automatic creation of a spatial join. To make the test more
        # interesting the embedded relationship is nonsense: it includes
        # combinations that do not overlap and leaves out ones that do.
        embedded_relationship = [
            make(
                skymap="SkyMap1",
                tract=1,
                patch=patch,
                instrument="Cam1",
                visit=1,
                detector=detector,
                universe=universe,
            )
            for patch, detector in [(1, 1), (1, 2), (3, 3)]
        ]
        no_auto_join = query.join_data_coordinates(embedded_relationship).where(
            skymap="SkyMap1", tract=1, patch=1
        )
        self.check_detector_records(no_auto_join.dimension_records("detector"), [1, 2])
        # Upload of a single empty data ID (not a useful thing to do, but
        # a way to probe edge-case behavior).
        empty_upload = [DataCoordinate.make_empty(universe=universe)]
        self.check_detector_records(
            query.join_data_coordinates(empty_upload).dimension_records("detector"),
            [1, 2, 3, 4],
        )
def test_data_coordinate_upload_force_temp_table(self) -> None:
    """Check a data coordinate upload that has to use a temporary table.

    The upload is made "too big" to be included directly in the query via
    bind params by shrinking the driver's threshold for creating a
    temporary table to a tiny value.

    This test assumes a DirectQueryDriver and is automatically skipped
    when some other driver is found.
    """
    butler = self.make_butler("base.yaml", "spatial.yaml")
    expected_detectors = [1, 3, 4]
    with butler.query() as query:
        driver = query._driver
        if not isinstance(driver, DirectQueryDriver):
            raise unittest.SkipTest("Test requires meddling with DirectQueryDriver internals.")
        # Three uploaded rows against a limit of two.
        driver._constant_rows_limit = 2
        data_coordinates = [
            DataCoordinate.standardize(instrument="Cam1", detector=detector, universe=butler.dimensions)
            for detector in expected_detectors
        ]
        self.check_detector_records(
            query.join_data_coordinates(data_coordinates).dimension_records("detector"),
            expected_detectors,
        )

        # Make sure it can fall back to a VALUES clause if temporary tables
        # are not supported by the DB.
        with unittest.mock.patch.object(driver.db, "_allow_temporary_tables", False):
            self.check_detector_records(
                query.join_data_coordinates(data_coordinates).dimension_records("detector"),
                expected_detectors,
            )
def test_materialization(self) -> None:
    """Check dimension-record queries run against a materialized query."""
    butler = self.make_butler("base.yaml", "datasets.yaml", "spatial.yaml")
    with butler.query() as query:
        _x = query.expression_factory
        in_raft_a = _x.detector.raft == "A"
        # Simplest case: the materialization holds exactly the dimensions
        # the rest of the query needs.
        exact_dimensions = query.where(in_raft_a, instrument="Cam1").materialize()
        self.check_detector_records(exact_dimensions.dimension_records("detector"), [1, 2])
        # The extra dimension in this materialization could cause
        # duplicates if we don't SELECT DISTINCT them away.
        extra_dimensions = (
            query.join_dimensions(["visit", "detector"]).where(in_raft_a, instrument="Cam1").materialize()
        )
        self.check_detector_records(extra_dimensions.dimension_records("detector"), [1, 2])
        # Materializing a spatial join should prevent the downstream query
        # from creating another one.
        spatial = query.join_dimensions(["visit", "detector", "tract"]).materialize()
        downstream = spatial.where(skymap="SkyMap1", tract=0, instrument="Cam1", visit=2)
        self.check_detector_records(
            downstream.dimension_records("detector"),
            [1, 2],
            has_postprocessing=True,
        )
        # Materialize with a dataset join.
        with_bias = query.join_dataset_search("bias", collections=["imported_g"]).materialize(
            datasets=["bias"]
        )
        self.check_detector_records(with_bias.dimension_records("detector"), [1, 2, 3])
def test_materialization_find_first(self) -> None:
    """Check find_first dataset queries against a materialized query.

    Nothing is asserted about the results; the test only requires that
    both dataset queries can be fully iterated.
    """
    butler = self.make_butler("ci_hsc-subset.yaml", "ci_hsc-subset-skymap.yaml")

    run = "HSC/runs/ci_hsc/20240806T180642Z"
    extra_run = "HSC/runs/ci_hsc/20240806T180642Z-extra"

    # Duplicate a few existing calexp datasets into a second run.
    duplicated_ids = [ref.dataId for ref in butler.query_datasets("calexp", run, limit=3)]
    butler.collections.register(extra_run)
    butler.registry.insertDatasets("calexp", duplicated_ids, extra_run)

    collections = [run, extra_run, "skymaps"]
    joined_dimensions = [
        "instrument",
        "physical_filter",
        "band",
        "visit",
        "detector",
        "day_obs",
        "skymap",
        "tract",
    ]
    with butler.query() as query:
        materialized = (
            query.join_dimensions(joined_dimensions)
            .join_dataset_search("skyMap", collections)
            .join_dataset_search("calexp", collections)
            .where({}, "instrument='HSC' AND skymap='discrete/ci_hsc'", bind=None)
            .materialize()
        )
        # Exhaust both result iterators; the values are not inspected.
        for dataset_type_name in ("skyMap", "calexp"):
            list(materialized.datasets(dataset_type_name, collections))
def test_materialization_no_results(self) -> None:
    """Check dataset queries against an empty materialized table."""
    butler = self.make_butler("ci_hsc-subset.yaml", "ci_hsc-subset-skymap.yaml")

    run = "HSC/runs/ci_hsc/20240806T180642Z"

    # Register a dataset type but do not add any datasets of that type.
    empty_type = DatasetType("nothing", ["visit", "detector"], "int", universe=butler.dimensions)
    butler.registry.registerDatasetType(empty_type)

    collections = [run]
    joined_dimensions = [
        "instrument",
        "physical_filter",
        "band",
        "visit",
        "detector",
        "day_obs",
        "skymap",
        "tract",
    ]
    expected_message = "No datasets of type 'nothing'"
    with butler.query() as query:
        constrained = (
            query.join_dimensions(joined_dimensions)
            .join_dataset_search("calexp", collections)
            .join_dataset_search("nothing", collections)
            .where({}, "instrument='HSC' AND skymap='discrete/ci_hsc'", bind=None)
        )
        # The diagnostic must be available before materializing...
        self.assertIn(expected_message, "\n".join(constrained.explain_no_results()))

        # ...and from the results of a query on the materialization.
        result = constrained.materialize().datasets("nothing")
        self.assertFalse(result.any())
        self.assertIn(expected_message, "\n".join(result.explain_no_results()))
def test_timespan_results(self) -> None:
    """Check that returned dimension records carry their timespans."""
    butler = self.make_butler("base.yaml", "spatial.yaml")

    def _tai(isot: str) -> astropy.time.Time:
        # Expected bounds are ISO-T strings on the TAI scale.
        return astropy.time.Time(isot, format="isot", scale="tai")

    expected = [
        (1, _tai("2021-09-09T03:00:00"), _tai("2021-09-09T03:01:00")),
        (2, _tai("2021-09-09T03:02:00"), _tai("2021-09-09T03:03:00")),
    ]
    with butler.query() as query:
        query_results = list(query.dimension_records("visit"))
    simple_results = butler.query_dimension_records("visit")
    # Both query interfaces must report the same begin/end per visit.
    for visit_records in (query_results, simple_results):
        self.assertCountEqual(
            [(visit.id, visit.timespan.begin, visit.timespan.end) for visit in visit_records],
            expected,
        )
def test_direct_driver_paging(self) -> None:
    """Check dimension-record queries that need more than one page.

    Multiple pages are forced by making the page size tiny for
    DirectQueryDriver.

    For RemoteQueryDriver, we can't manipulate the page size so this just
    checks that the driver context manager logic is executing.
    """
    butler = self.make_butler("base.yaml")

    def _shrink_page_size(target: Query) -> None:
        # Only DirectQueryDriver gets its page size changed here.
        driver = target._driver
        if isinstance(driver, DirectQueryDriver):
            driver._raw_page_size = 2

    # Basic test where pages should be transparent.
    with butler.query() as query:
        _shrink_page_size(query)
        self.check_detector_records(
            query.dimension_records("detector"),
            [1, 2, 3, 4],
        )
    # Start iterating inside the context, then try to continue after it
    # has closed: that must be an error.
    with butler.query() as query:
        _shrink_page_size(query)
        iterator = iter(query.dimension_records("detector"))
        next(iterator)
    with self.assertRaisesRegex(RuntimeError, "Cannot continue query result iteration"):
        list(iterator)
def test_column_expressions(self) -> None:
    """Test queries with a wide variety of column expressions."""
    butler = self.make_butler("base.yaml", "spatial.yaml")
    butler.registry.defaults = RegistryDefaults(instrument="Cam1")

    def _tai(isot: str) -> astropy.time.Time:
        # Every timestamp in this test is an ISO-T string on the TAI scale.
        return astropy.time.Time(isot, format="isot", scale="tai")

    def _visit_ids(constrained: Query) -> list[int]:
        # IDs of the visit records selected by an already-constrained query.
        return [record.id for record in constrained.dimension_records("visit")]

    def _check_detectors(constrained: Query, expected: list[int]) -> None:
        # Compare the detector records of a constrained query to `expected`.
        self.check_detector_records(constrained.dimension_records("detector"), expected)

    with butler.query() as query:
        _x = query.expression_factory

        # Negation, literals, arithmetic and unary minus on detector.
        _check_detectors(query.where(_x.not_(_x.detector != 2)), [2])
        self.check_detector_records_returned(
            butler.query_dimension_records("detector", where="NOT (detector != 2)"),
            [2],
        )
        # Empty string expression should evaluate to True.
        _check_detectors(query.where(_x.detector == 2, ""), [2])
        _check_detectors(query.where(_x.literal(2) == _x.detector), [2])
        _check_detectors(query.where(_x.literal(2) == _x.detector + 1), [1])
        _check_detectors(query.where(-_x.detector == -3), [3])
        self.check_detector_records(
            query.where(_x.detector == 1, _x.detector == 2).dimension_records("detector"),
            [],
            messages=["'where' expression requires both detector=2 and detector=1."],
        )

        # Timespan overlaps with a single datetime. The begin bound is
        # inclusive, so a datetime equal to "begin" matches the record.
        at_begin = _x.visit.timespan.overlaps(_tai("2021-09-09T03:00:00"))
        self.assertCountEqual(_visit_ids(query.where(at_begin)), [1])
        # The end bound is exclusive, so a datetime equal to "end" matches
        # no records.
        at_end = _x.visit.timespan.overlaps(_tai("2021-09-09T03:01:00"))
        self.assertCountEqual(_visit_ids(query.where(at_end)), [])
        # In the middle of the timespan.
        in_middle = _x.visit.timespan.overlaps(_tai("2021-09-09T03:02:30"))
        self.assertCountEqual(_visit_ids(query.where(in_middle)), [2])
        # In the middle of the timespan, via the string syntax and a bind.
        bound_records = butler.query_dimension_records(
            "visit",
            where="visit.timespan OVERLAPS(:ts)",
            bind={"ts": _tai("2021-09-09T03:02:30")},
        )
        self.assertCountEqual([record.id for record in bound_records], [2])
        # Overlap with a Timespan that has no end.
        open_ended = Timespan(begin=_tai("2021-09-09T03:02:30"), end=None)
        self.assertCountEqual(
            _visit_ids(query.where(_x.visit.timespan.overlaps(open_ended))),
            [2],
        )
        # Ordering comparisons on the individual timespan bounds.
        self.assertCountEqual(
            _visit_ids(query.where(_x.not_(_x.visit.timespan.end < _tai("2021-09-09T03:02:30")))),
            [2],
        )
        self.assertCountEqual(
            _visit_ids(query.where(_x.visit.timespan.begin > _tai("2021-09-09T03:01:30"))),
            [2],
        )

        # Arithmetic on float and integer visit columns.
        self.assertCountEqual(
            _visit_ids(query.where((_x.visit.exposure_time + -(5.0 * _x.visit.zenith_angle)) > 0.0)),
            [1],
        )
        self.assertCountEqual(_visit_ids(query.where(_x.visit.exposure_time - 5.0 >= 50.0)), [1])
        self.assertCountEqual(_visit_ids(query.where(_x.visit.id % 2 != 0)), [1])
        self.assertCountEqual(_visit_ids(query.where(_x.visit.zenith_angle / 5.0 <= 1.0)), [1])

        # NULL checks.
        self.assertCountEqual(_visit_ids(query.where(_x.visit.timespan.is_null)), [])
        self.assertCountEqual(_visit_ids(query.where(_x.visit.exposure_time.is_null)), [])

        # Allow comparison of float columns with int literals
        self.assertCountEqual(_visit_ids(query.where("visit.exposure_time > 50")), [1])
        self.assertCountEqual(_visit_ids(query.where(_x.visit.exposure_time > 50)), [1])

        # Membership in an iterable and in ranges.
        _check_detectors(query.where(_x.detector.in_iterable([1, 3, 4])), [1, 3, 4])
        self.check_detector_records_returned(
            butler.query_dimension_records("detector", where="detector IN (:det)", bind={"det": [1, 3, 4]}),
            [1, 3, 4],
        )
        _check_detectors(query.where(_x.detector.in_range(start=2, stop=None)), [2, 3, 4])
        _check_detectors(query.where(_x.detector.in_range(start=1, stop=3)), [1, 2])
        _check_detectors(query.where(_x.detector.in_range(start=1, stop=None, step=2)), [1, 3])
        _check_detectors(query.where(_x.detector.in_range(start=1, stop=2)), [1])
        # This is a complex way to write a much simpler query ("where
        # detector.raft == 'A'"), but it tests code paths that would
        # otherwise require a lot more test setup.
        raft_a_subquery = query.where(_x.detector.raft == "A")
        _check_detectors(
            query.where(_x.detector.in_query(_x.detector, raft_a_subquery)),
            [1, 2],
        )
        # Error to reference tract without skymap in a WHERE clause.
        with self.assertRaises(InvalidQueryError):
            list(query.where(_x.tract == 4).dimension_records("patch"))
def test_boolean_columns(self) -> None:
    """Exercise boolean dimension columns in query expressions.

    Both the string ``where`` syntax (through the registry, the query
    context and the convenience method) and predicates built with the
    expression factory are covered.
    """
    # Only exposure carries boolean columns, and these data files already
    # supply most of the records an exposure depends on.
    butler = self.make_butler("base.yaml", "spatial.yaml")
    butler.registry.insertDimensionData("group", {"instrument": "Cam1", "name": "1"})

    sky_true = 1000
    sky_false_a = 2001
    sky_false_b = 2002
    sky_null = 3000
    shared_fields = {
        "instrument": "Cam1",
        "physical_filter": "Cam1-R1",
        "group": "1",
        "day_obs": 20210909,
    }
    exposure_fields = (
        {"id": sky_true, "obs_id": "true-1", "can_see_sky": True},
        {"id": sky_false_a, "obs_id": "false-1", "can_see_sky": False, "observation_type": "science"},
        {"id": sky_false_b, "obs_id": "false-2", "can_see_sky": False, "observation_type": None},
        {"id": sky_null, "obs_id": "null-1", "can_see_sky": None},
    )
    for fields in exposure_fields:
        butler.registry.insertDimensionData("exposure", {**shared_fields, **fields})

    # The registry entry point is included so the old query system is
    # covered as well; drop it once the old query system is removed.
    def ids_via_registry(where: str) -> list[int]:
        found = butler.registry.queryDimensionRecords("exposure", where=where, instrument="Cam1")
        return _get_exposure_ids_from_dimension_records(found)

    def ids_via_convenience_method(where: str) -> list[int]:
        found = butler.query_dimension_records("exposure", where=where, instrument="Cam1")
        return _get_exposure_ids_from_dimension_records(found)

    def ids_via_query_context(where: str) -> list[int]:
        # The results must be consumed while the query context is open.
        with butler.query() as query:
            found = query.dimension_records("exposure").where(where, instrument="Cam1")
            return _get_exposure_ids_from_dimension_records(found)

    # Boolean columns in the string `where` syntax, via every entry point.
    string_runners = (
        ("registry", ids_via_registry),
        ("new-query", ids_via_query_context),
        ("simple", ids_via_convenience_method),
    )
    for label, run_where in string_runners:
        with self.subTest(label):
            # A bare boolean column is a complete expression.
            self.assertCountEqual(run_where("exposure.can_see_sky"), [sky_true])

            # NOT selects the false rows. NOT NULL is NULL (SQL
            # semantics), so rows with a NULL can_see_sky are left out.
            self.assertCountEqual(run_where("NOT exposure.can_see_sky"), [sky_false_a, sky_false_b])

            # The bare column also combines with other terms.
            self.assertCountEqual(
                run_where("exposure.can_see_sky OR exposure = 2001"), [sky_true, sky_false_a]
            )

    # NULL comparisons are only exercised on the new query system: the
    # `= NULL` syntax apparently never had coverage in the old one and does
    # not work there for any column type. It is not worth fixing because
    # that code is being dropped soon.
    with_null = [sky_null]
    without_null = [sky_true, sky_false_a, sky_false_b]
    boolean_null_cases = (
        ("exposure.can_see_sky = NULL", with_null),
        ("exposure.can_see_sky != NULL", without_null),
        ("NULL = exposure.can_see_sky", with_null),
        ("NULL != exposure.can_see_sky", without_null),
    )
    for where, expected in boolean_null_cases:
        self.assertCountEqual(ids_via_query_context(where), expected)

    # A NULL check against an arbitrary boolean predicate is rejected.
    with self.assertRaises(InvalidQueryError):
        ids_via_query_context("NULL = (exposure.can_see_sky AND exposure = 2001)")

    # NULL checks on a non-boolean column.
    string_null_cases = (
        ("exposure.observation_type = NULL AND NOT exposure.can_see_sky", [sky_false_b]),
        ("exposure.observation_type != NULL AND NOT exposure.can_see_sky", [sky_false_a]),
        ("NULL = exposure.observation_type AND NOT exposure.can_see_sky", [sky_false_b]),
        ("NULL != exposure.observation_type AND NOT exposure.can_see_sky", [sky_false_a]),
    )
    for where, expected in string_null_cases:
        self.assertEqual(ids_via_query_context(where), expected)

    # Boolean columns through ExpressionFactory.
    with butler.query() as query:
        factory = query.expression_factory

        def ids_matching(constraint: Predicate) -> list[int]:
            found = query.dimension_records("exposure").where(constraint, instrument="Cam1")
            return _get_exposure_ids_from_dimension_records(found)

        sky_column = factory.exposure.can_see_sky

        # A boolean column converts to a standalone Predicate.
        self.assertCountEqual(ids_matching(sky_column.as_boolean()), [sky_true])

        # Negation selects the false rows; as above, NULL rows are
        # excluded because NOT NULL is NULL.
        self.assertCountEqual(
            ids_matching(sky_column.as_boolean().logical_not()), [sky_false_a, sky_false_b]
        )

        # Searching for nulls works.
        self.assertCountEqual(ids_matching(sky_column.is_null), [sky_null])

        # Operators meant for non-boolean types are an error on a boolean
        # column; building the comparison is what raises.
        with self.assertRaisesRegex(
            InvalidQueryError,
            r"Boolean expression 'exposure.can_see_sky' can't be used directly in other expressions."
            r" Call the 'as_boolean\(\)' method to convert it to a Predicate instead.",
        ):
            factory.exposure.can_see_sky == 1

        # A non-boolean column cannot be turned into a Predicate.
        with self.assertRaisesRegex(
            InvalidQueryError,
            r"Expression 'exposure.observation_type' with type 'string' can't be used directly"
            r" as a boolean value.",
        ):
            factory.exposure.observation_type.as_boolean()
def test_dataset_region_queries(self) -> None:
    """Check spatial constraints on dataset queries.

    Regions are supplied both as bound ``Region`` objects and as literals
    embedded in the ``where`` string. Every case is run through the query
    context and through ``Butler.query_datasets``, and the two must agree.
    """
    butler = self.make_butler("base.yaml", "ci_hsc-subset.yaml")
    run = "HSC/runs/ci_hsc/20240806T180642Z"

    # Unconstrained query, ordered by data coordinate.
    with butler.query() as query:
        everything = query.datasets("calexp", collections=run).with_dimension_records()
        refs = sorted(everything, key=attrgetter("dataId"))
        self.assertEqual(len(refs), 33)

    first_ref = refs[0]
    # Region of the visit the first dataset belongs to.
    first_visit_region = first_ref.dataId.visit.region  # type: ignore

    # Visit-detector region record for the first dataset's data ID.
    with butler.query() as query:
        data_id = first_ref.dataId.mapping
        records = list(query.dimension_records("visit_detector_region").where(**data_id))  # type: ignore
        self.assertEqual(len(records), 1)

    # (IVOA POS string, expected count); passed as a bound Region.
    bound_cases = (
        ("CIRCLE 320. -0.25 10.", 33),  # Match everything.
        ("CIRCLE 321.0 -0.4 0.01", 1),  # Should be small region on 1 detector.
        ("CIRCLE 321.1 -0.35 0.02", 2),
        ("CIRCLE 321.1 -0.48 0.05", 1),  # Center off the region.
        ("CIRCLE 321.0 -0.5 0.01", 0),  # No overlap.
        (first_visit_region.to_ivoa_pos(), 33),  # Visit region overlaps everything.
        (records[0].region.to_ivoa_pos(), 17),  # Some overlap.
    )
    # (expression text, expected count); pasted into the where string.
    literal_cases = (
        ("CIRCLE(320., -0.25, 10.)", 33),  # Match everything.
        ("CIRCLE(321.0, -0.4, 0.01)", 1),  # Should be small region on 1 detector.
        ("CIRCLE(321.0, -0.5, 0.01)", 0),  # No overlap.
        ("BOX(320, -0.25, 5, 5)", 33),  # Match everything.
        ("BOX(321.0, -0.4, 0.01, 0.01)", 1),  # Should be small region on 1 detector.
        ("BOX(321.0, -0.5, 0.01, 0.01)", 0),  # No overlap.
        ("POLYGON(320, -10, 320, 10, 340, 10, 340, -10)", 33),  # Match everything.
        ("POLYGON(320.99, -0.401, 320.99, -0.399, 321.01, -0.399, 321.01, -0.401)", 1),
        ("POLYGON(320.99, -0.501, 320.99, -0.499, 321.01, -0.499, 321.01, -0.501)", 0),
        ("REGION('CIRCLE 320. -0.25 10.')", 33),  # Match everything.
        ("REGION('RANGE 310 330 -10 10')", 33),  # Match everything.
        ("REGION('RANGE 320.99 321.01 -0.401 -0.399')", 1),  # Small region on 1 detector.
        ("REGION('POLYGON 320.99 -0.501 320.99 -0.499 321.01 -0.499 321.01 -0.501')", 0),
    )
    cases = [(text, True, expected) for text, expected in bound_cases]
    cases += [(text, False, expected) for text, expected in literal_cases]

    for pos, use_bind, count in cases:
        overlap_where = (
            "visit_detector_region.region OVERLAPS :POS"
            if use_bind
            else f"visit_detector_region.region OVERLAPS {pos}"
        )
        bind = {"POS": Region.from_ivoa_pos(pos)} if use_bind else {}

        with butler.query() as query:
            constrained = query.datasets("calexp", collections=run).where(
                f"instrument = 'HSC' AND {overlap_where}", bind=bind
            )
            refs = list(constrained)
            self.assertEqual(len(refs), count, f"POS={pos} REFS={refs}")

        # The convenience method must return the same datasets.
        simple_refs = butler.query_datasets(
            "calexp",
            collections=run,
            instrument="HSC",
            where=overlap_where,
            bind=bind,
            explain=False,
        )
        self.assertCountEqual(refs, simple_refs)
def test_dataset_time_queries(self) -> None:
    """Check temporal constraints on dataset queries."""
    butler = self.make_butler("base.yaml", "ci_hsc-subset.yaml")
    run = "HSC/runs/ci_hsc/20240806T180642Z"

    def tai(isot: str) -> astropy.time.Time:
        """Build a TAI time from an ISO-format string."""
        return astropy.time.Time(isot, scale="tai", format="isot")

    # Some times from the test data.
    before_903334 = tai("2013-01-01T12:00:00")
    during_903334 = tai("2013-06-17T13:29:20")
    before_904014 = tai("2013-11-01T12:00:00")
    after_904014 = tai("2013-12-21T12:00:00")

    with butler.query() as query:
        calexps = query.datasets("calexp", collections=run)

        # A single instant in the middle of a visit.
        at_instant = calexps.where(
            "instrument = 'HSC' and visit.timespan OVERLAPS(:ts)", bind={"ts": during_903334}
        )
        self.assertEqual(len(list(at_instant)), 4)

        # A pair of bound times covering the first half of the data.
        between_times = calexps.where(
            "instrument = 'HSC' and visit.timespan OVERLAPS(:t1, :t2)",
            bind={"t1": before_903334, "t2": before_904014},
        )
        self.assertEqual(len(list(between_times)), 17)

        # A bound Timespan object.
        in_timespan = calexps.where(
            "instrument = 'HSC' and visit.timespan OVERLAPS(:ts)",
            bind={"ts": Timespan(before_904014, after_904014)},
        )
        self.assertEqual(len(list(in_timespan)), 16)
def test_calibration_join_queries(self) -> None:
    """Join observations to calibration datasets temporally using the
    'general' query result type.

    General results are required because each row pairs a calibration
    DatasetRef with a data ID that includes the observation identifiers,
    and those are not part of the calibration dataset's own dimensions.
    """
    butler = self.make_butler("base.yaml", "datasets.yaml")

    def tai(isot: str) -> astropy.time.Time:
        """Build a TAI time from an ISO-format string."""
        return astropy.time.Time(isot, format="isot", scale="tai")

    # Five hourly timestamps.
    t1 = tai("2020-01-01T01:00:00")
    t2 = tai("2020-01-01T02:00:00")
    t3 = tai("2020-01-01T03:00:00")
    t4 = tai("2020-01-01T04:00:00")
    t5 = tai("2020-01-01T05:00:00")

    # One exposure per consecutive pair of timestamps, plus the day_obs
    # and group records the exposures refer to.
    butler.registry.insertDimensionData(
        "day_obs", {"instrument": "Cam1", "id": 20200101, "timespan": Timespan(t1, t5)}
    )
    butler.registry.insertDimensionData(
        "group",
        {"instrument": "Cam1", "name": "group0"},
        {"instrument": "Cam1", "name": "group1"},
        {"instrument": "Cam1", "name": "group2"},
        {"instrument": "Cam1", "name": "group3"},
    )
    exposure_specs = (
        (0, "group0", "zero", Timespan(t1, t2)),
        (1, "group1", "one", Timespan(t2, t3)),
        (2, "group2", "two", Timespan(t3, t4)),
        (3, "group3", "three", Timespan(t4, t5)),
    )
    butler.registry.insertDimensionData(
        "exposure",
        *(
            {
                "instrument": "Cam1",
                "id": exposure_id,
                "group": group_name,
                "obs_id": obs_id,
                "physical_filter": "Cam1-G",
                "day_obs": 20200101,
                "timespan": span,
            }
            for exposure_id, group_name, obs_id, span in exposure_specs
        ),
    )

    # Look up the imported bias datasets: detectors 2 and 3 in each of the
    # two imported collections.
    bias = butler.get_dataset_type("bias")
    g_det2 = butler.find_dataset("bias", instrument="Cam1", detector=2, collections="imported_g")
    assert g_det2 is not None
    g_det3 = butler.find_dataset("bias", instrument="Cam1", detector=3, collections="imported_g")
    assert g_det3 is not None
    r_det2 = butler.find_dataset("bias", instrument="Cam1", detector=2, collections="imported_r")
    assert r_det2 is not None
    r_det3 = butler.find_dataset("bias", instrument="Cam1", detector=3, collections="imported_r")
    assert r_det3 is not None

    # Build the calibration collection used by the joins below.
    collection = "Cam1/calibs"
    butler.collections.register(collection, type=CollectionType.CALIBRATION)
    # imported_g detector 2: valid over [t2, t4).
    butler.registry.certify(collection, [g_det2], Timespan(begin=t2, end=t4))
    # imported_g detector 3: valid over [t1, t3).
    butler.registry.certify(collection, [g_det3], Timespan(begin=t1, end=t3))
    # Both imported_r datasets: valid over [t4, ∞).
    butler.registry.certify(collection, [r_det2, r_det3], Timespan(begin=t4, end=None))

    base_data_id = DataCoordinate.standardize(instrument="Cam1", universe=butler.dimensions)

    def at(detector: int, exposure: int) -> DataCoordinate:
        """Expand the base data ID with a detector and an exposure."""
        return DataCoordinate.standardize(base_data_id, detector=detector, exposure=exposure)

    # (data ID, bias) pairs expected from both join spellings below.
    expected_pairs = [
        (at(detector=2, exposure=1), g_det2),
        (at(detector=2, exposure=2), g_det2),
        (at(detector=3, exposure=0), g_det3),
        (at(detector=3, exposure=1), g_det3),
        (at(detector=2, exposure=3), r_det2),
        (at(detector=3, exposure=3), r_det3),
    ]

    # Query for (bias, exposure, detector) combinations.
    with butler.query() as q:
        factory = q.expression_factory
        joined = q.join_dimensions(["exposure"]).join_dataset_search("bias", [collection])

        # Temporal join written out explicitly.
        explicit_overlap = factory["bias"].timespan.overlaps(factory.exposure.timespan)
        explicit_rows = (
            joined.where(explicit_overlap, base_data_id)
            .general(
                butler.dimensions.conform(["exposure", "detector"]),
                dataset_fields={"bias": ...},
                find_first=True,
            )
            .iter_tuples(bias)
        )
        self.assertCountEqual(
            [(data_id, refs[0]) for data_id, refs, _ in explicit_rows],
            expected_pairs,
        )

        # Temporal join left implicit, with an incomplete dimension list
        # (detector is added by the dataset results).
        implicit_rows = (
            joined.where(base_data_id)
            .general(["exposure"], dataset_fields={"bias": ...}, find_first=True)
            .iter_tuples(bias)
        )
        self.assertCountEqual(
            [(data_id, refs[0]) for data_id, refs, _ in implicit_rows],
            expected_pairs,
        )

    # An explicit timespan constraint on searches that resolve in a RUN
    # collection. With only the RUN collection, the query should succeed
    # because the timespans for the dataset_tags tables are logically
    # unbounded, not NULL. With the RUN collection ahead of a CALIBRATION
    # collection that would also match, the RUN collection should win.
    for search_path in (["imported_g"], ["imported_g", collection]):
        with butler.query() as query:
            bias_timespan = query.expression_factory["bias"].timespan
            constrained = (
                query.datasets("bias", collections=search_path)
                .where(instrument="Cam1", detector=2)
                .where(bias_timespan.overlaps(Timespan(begin=t1, end=t2)))
            )
            self.assertEqual([ref.id for ref in constrained], [g_det2.id])

    # Dataset IDs expected for detector 2 in the searches below.
    uuid_g = "51352db4-a47a-447c-b12d-a50b206b17cd"
    uuid_r = "87f3e68d-258d-41b7-8ea5-edf3557ccb30"

    # Searching several collections when one is a calibration collection
    # triggers special cases related to timespan columns in the query code.
    mixed_refs = butler.query_datasets(
        "bias",
        collections=[collection, "imported_g"],
        where="instrument = 'Cam1' and detector = 2",
        find_first=False,
    )
    self.assertEqual(sorted(str(ref.id) for ref in mixed_refs), [uuid_g, uuid_r])

    with butler.query() as query:
        searched = query.join_dataset_search("bias", [collection, "imported_g"]).where(
            "instrument = 'Cam1' and detector = 2"
        )

        # With no collection/timespan column requested, dataset IDs should
        # be de-duplicated.
        id_rows = list(
            searched.general(
                ["detector"],
                dataset_fields={"bias": {"dataset_id"}},
                find_first=False,
            ).iter_tuples()
        )
        self.assertEqual(
            sorted(str(row.raw_row["bias.dataset_id"]) for row in id_rows),
            [uuid_g, uuid_r],
        )

        # With the timespan requested there is one row per timespan of
        # each dataset ID. The extra copy of "51352..." comes from the run
        # collection, with a timespan of None.
        timespan_rows = list(
            searched.general(
                ["detector"],
                dataset_fields={"bias": {"dataset_id", "timespan"}},
                find_first=False,
            ).iter_tuples()
        )
        self.assertEqual(
            sorted(str(row.raw_row["bias.dataset_id"]) for row in timespan_rows),
            [uuid_g, uuid_g, uuid_r],
        )

        # With the collection requested there is one row per collection
        # associated with each dataset ID.
        collection_rows = list(
            searched.general(
                ["detector"],
                dataset_fields={"bias": {"dataset_id", "collection"}},
                find_first=False,
            ).iter_tuples()
        )
        self.assertEqual(
            sorted(
                (str(row.raw_row["bias.dataset_id"]), row.raw_row["bias.collection"])
                for row in collection_rows
            ),
            [
                (uuid_g, "Cam1/calibs"),
                (uuid_g, "imported_g"),
                (uuid_r, "Cam1/calibs"),
            ],
        )
def test_collection_query_info(self) -> None:
    """Check the dataset-type summary returned by
    ``butler.collections.query_info``, with and without a
    ``summary_datasets`` restriction.
    """
    butler = self.make_butler("base.yaml", "datasets.yaml")

    # Unrestricted summary: both dataset types are reported.
    full_info = butler.collections.query_info("imported_g", include_summary=True)
    self.assertEqual(len(full_info), 1)
    all_types = full_info[0].dataset_types
    assert all_types is not None
    self.assertCountEqual(all_types, ["flat", "bias"])

    # Summary restricted to "flat": only that type is reported.
    flat_info = butler.collections.query_info(
        "imported_g", include_summary=True, summary_datasets=["flat"]
    )
    self.assertEqual(len(flat_info), 1)
    flat_types = flat_info[0].dataset_types
    assert flat_types is not None
    self.assertCountEqual(flat_types, ["flat"])
def test_dataset_queries(self) -> None:
    """Test dataset queries that combine a single run collection with
    spatial constraints, and the use of dataset fields in ``where``.
    """
    butler = self.make_butler("base.yaml", "spatial.yaml")

    # Need a dataset with some spatial information to trigger aggregate
    # value logic in queries.
    butler.registry.registerDatasetType(
        DatasetType("dt", ["visit", "detector"], "int", universe=butler.dimensions)
    )
    butler.collections.register("run1")
    butler.registry.insertDatasets("dt", [{"instrument": "Cam1", "visit": 1, "detector": 1}], "run1")

    # Tests for a regression of DM-46340, where invalid SQL would be
    # generated when the list of collections is a single run collection and
    # there is region-postprocessing logic involved. This was due to
    # missing type information associated with the "run" dataset field.
    result = butler.query_datasets(
        "dt",
        "run1",
        where="instrument='Cam1' and skymap='SkyMap1' and visit=1 and tract=0",
        with_dimension_records=True,
    )
    self.assertEqual(result[0].dataId, {"instrument": "Cam1", "visit": 1, "detector": 1})
    self.assertEqual(result[0].run, "run1")

    # A similar issue to the "run" issue above was occurring with the
    # 'collection' dataset field.
    with butler.query() as query:
        rows = list(
            query.join_dataset_search("dt", "run1")
            .where("instrument='Cam1' and skymap='SkyMap1' and visit=1 and tract=0")
            .general(
                dimensions=["visit", "detector"],
                # Set literal rather than set([...]): same value, no
                # throwaway list.
                dataset_fields={"dt": {"collection"}},
                find_first=True,
            )
        )
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0]["visit"], 1)
        self.assertEqual(rows[0]["dt.collection"], "run1")

    # Test that dataset fields like ingest_date can be used in the 'where'
    # clause. Queries expected to be empty pass explain=False.
    result = butler.query_datasets("dt", "run1", where="ingest_date > T'2000-01-01'")
    self.assertEqual(len(result), 1)
    result = butler.query_datasets("dt", "run1", where="ingest_date < T'2000-01-01'", explain=False)
    self.assertEqual(len(result), 0)
    result = butler.query_datasets(
        "dt", "run1", where="ingest_date OVERLAPS (T'2000-01-01', T'2099-01-01')"
    )
    self.assertEqual(len(result), 1)
    result = butler.query_datasets(
        "dt", "run1", where="(T'2000-01-01', T'2099-01-01') OVERLAPS ingest_date"
    )
    self.assertEqual(len(result), 1)
    result = butler.query_datasets(
        "dt", "run1", where="(T'2000-01-01', T'2001-01-01') OVERLAPS ingest_date", explain=False
    )
    self.assertEqual(len(result), 0)
def test_multiple_instrument_queries(self) -> None:
    """Verify that constraints naming several instruments are accepted
    instead of being rejected as governor dimension ambiguities.
    """
    butler = self.make_butler("base.yaml")
    butler.registry.insertDimensionData("instrument", {"name": "Cam2"})

    # A plain OR over two instruments.
    either_instrument = butler.query_data_ids(
        ["detector"], where="instrument='Cam1' OR instrument='Cam2'"
    )
    cam1_detectors = [
        DataCoordinate.standardize(instrument="Cam1", detector=n, universe=butler.dimensions)
        for n in range(1, 5)
    ]
    self.assertCountEqual(either_instrument, cam1_detectors)

    # The same OR combined with a spatial constraint. This test dataset has
    # no visits, so nothing comes back; the point is only that the query
    # can be constructed at all.
    with_region = butler.query_data_ids(
        ["detector"],
        where="(instrument='Cam1' OR instrument='Cam2') AND visit.region OVERLAPS :region",
        bind={"region": Region.from_ivoa_pos("CIRCLE 320. -0.25 10.")},
        explain=False,
    )
    self.assertCountEqual(with_region, [])

    # Each OR branch pins its own instrument together with a detector.
    per_branch = butler.query_data_ids(
        ["instrument"],
        where="(instrument='Cam1' AND detector=2) OR (instrument='Cam2' AND detector=500)",
        explain=False,
    )
    cam1_only = [DataCoordinate.standardize(instrument="Cam1", universe=butler.dimensions)]
    self.assertCountEqual(per_branch, cam1_only)
def test_default_data_id(self) -> None:
    """Check how a default data ID instrument constrains queries, and
    what happens when no default is set.
    """
    butler = self.make_butler("base.yaml")
    butler.registry.insertDimensionData("instrument", {"name": "Cam2"})
    butler.registry.insertDimensionData(
        "physical_filter", {"instrument": "Cam2", "name": "Cam2-G", "band": "g"}
    )

    # No default data ID: every instrument contributes results.
    self.assertCountEqual(
        [rec.name for rec in butler.query_dimension_records("physical_filter")],
        ["Cam1-G", "Cam1-R1", "Cam1-R2", "Cam2-G"],
    )
    self.assertCountEqual(
        [rec.name for rec in butler.query_dimension_records("physical_filter", where="band='g'")],
        ["Cam1-G", "Cam2-G"],
    )

    # No default data ID, and the where clause references something that
    # depends on instrument: rejected as a sanity check. 'instrument' is
    # not among the returned dimensions here, so extra logic is needed to
    # detect the need for the default data ID.
    with self.assertRaisesRegex(
        InvalidQueryError,
        "Query 'where' expression references a dimension dependent on instrument"
        " without constraining it directly.",
    ):
        butler.query_data_ids(["band"], where="physical_filter='Cam1-G'")

    # From here on, a default instrument is in place.
    butler.registry.defaults = RegistryDefaults(instrument="Cam1")

    # The same where clause now uses the default to constrain the
    # instrument, again without 'instrument' being a returned dimension.
    cam1_bands = butler.query_data_ids(["band"], where="physical_filter='Cam1-G'")
    self.assertEqual([data_id["band"] for data_id in cam1_bands], ["g"])
    # The default instrument=Cam1 does not match Cam2, so nothing is found.
    mismatched = butler.query_data_ids(["band"], where="physical_filter='Cam2-G'", explain=False)
    self.assertEqual(mismatched, [])
    # Naming the instrument explicitly overrides the default.
    cam2_bands = butler.query_data_ids(
        ["band"], where="instrument='Cam2' and physical_filter='Cam2-G'"
    )
    self.assertEqual([data_id["band"] for data_id in cam2_bands], ["g"])

    # A query for a dimension that depends on instrument pulls in the
    # default instrument="Cam1" ...
    self.assertCountEqual(
        [rec.name for rec in butler.query_dimension_records("physical_filter")],
        ["Cam1-G", "Cam1-R1", "Cam1-R2"],
    )
    # ... including when a where clause is present that does not
    # explicitly specify instrument.
    self.assertEqual(
        [rec.name for rec in butler.query_dimension_records("physical_filter", where="band='g'")],
        ["Cam1-G"],
    )

    # A where clause that names the instrument explicitly ignores the
    # default data ID.
    self.assertCountEqual(
        [
            rec.name
            for rec in butler.query_dimension_records("physical_filter", where="instrument='Cam2'")
        ],
        ["Cam2-G"],
    )
    self.assertCountEqual(
        [
            rec.name
            for rec in butler.query_dimension_records(
                "physical_filter", where="instrument IN ('Cam2')"
            )
        ],
        ["Cam2-G"],
    )
def test_unusual_column_literals(self) -> None:
    """Check that numpy integers are accepted as literals in queries.

    Users frequently use numpy integer types as literals, so they are
    tried as a data ID value, as a bind value and in an expression-factory
    comparison.
    """
    butler = self.make_butler("base.yaml")

    # As a data ID value.
    by_data_id = butler.query_dimension_records(
        "detector", data_id={"instrument": "Cam1", "detector": int64(1)}
    )
    self.assertEqual([rec.full_name for rec in by_data_id], ["Aa"])

    # As a bind value in a where string.
    by_bind = butler.query_dimension_records(
        "detector", where="instrument='Cam1' and detector=:an_integer", bind={"an_integer": int64(2)}
    )
    self.assertEqual([rec.full_name for rec in by_bind], ["Ab"])

    # As the right-hand side of an expression-factory comparison.
    with butler.query() as query:
        factory = query.expression_factory
        by_expression = list(
            query.dimension_records("detector").where(
                factory.instrument == "Cam1", factory.detector == int64(3)
            )
        )
        self.assertEqual([rec.full_name for rec in by_expression], ["Ba"])
def test_query_all_datasets(self) -> None:
    """Exercise ``Butler.query_all_datasets``: ref contents, data ID and
    bind constraints, find-first, collection and dataset type wildcards,
    error cases, limits and registry defaults.
    """
    butler = self.make_butler("base.yaml", "datasets.yaml")

    # Dataset IDs for detector 2 that recur in the expectations below.
    g_bias_det2 = "51352db4-a47a-447c-b12d-a50b206b17cd"
    g_flat_det2 = "60c8a65c-7290-4c38-b1de-e3b1cdcf872d"
    r_bias_det2 = "87f3e68d-258d-41b7-8ea5-edf3557ccb30"
    r_flat_det2 = "c1296796-56c5-4acf-9b49-40d920c6f840"

    # Make sure that refs are coming out well-formed.
    by_type = sorted(
        butler.query_all_datasets("imported_r", where="detector = 2", instrument="Cam1"),
        key=lambda ref: ref.datasetType.name,
    )
    self.assertEqual(len(by_type), 2)
    bias, flat = by_type
    self.assertEqual(bias.datasetType.name, "bias")
    self.assertEqual(bias.dataId["instrument"], "Cam1")
    self.assertEqual(bias.dataId["detector"], 2)
    self.assertEqual(bias.run, "imported_r")
    self.assertEqual(bias.id, UUID(r_bias_det2))
    self.assertEqual(flat.datasetType.name, "flat")
    self.assertEqual(flat.dataId["instrument"], "Cam1")
    self.assertEqual(flat.dataId["detector"], 2)
    self.assertEqual(flat.dataId["physical_filter"], "Cam1-R1")
    self.assertEqual(flat.dataId["band"], "r")
    self.assertEqual(flat.run, "imported_r")
    self.assertEqual(flat.id, UUID(r_flat_det2))

    # Querying for everything finds everything.
    everything = butler.query_all_datasets("*", find_first=False)
    self.assertEqual(len(everything), 13)

    # Constraining by data ID works.
    detector_1_ids = ("d0bb04cd-d697-4a83-ba53-cdfcd58e3a0c", "e15ab039-bc8b-4135-87c5-90902a7c0b22")
    by_data_id = butler.query_all_datasets(
        "*", data_id={"detector": 1, "instrument": "Cam1"}, find_first=False
    )
    self.assertCountEqual(detector_1_ids, _ref_uuids(by_data_id))

    # Bind values work.
    by_bind = butler.query_all_datasets(
        "*", where="detector=:my_bind and instrument='Cam1'", bind={"my_bind": 1}, find_first=False
    )
    self.assertCountEqual(detector_1_ids, _ref_uuids(by_bind))

    # find_first requires ordered collections.
    with self.assertRaisesRegex(InvalidQueryError, "Can not use wildcards"):
        butler.query_all_datasets("*")

    # find_first searches the collection chain in order.
    butler.collections.register("chain", CollectionType.CHAINED)
    butler.collections.redefine_chain("chain", ["imported_g", "imported_r"])
    from_chain = butler.query_all_datasets(
        "chain", where="detector=2 and instrument = 'Cam1'", find_first=True
    )
    # imported_r also holds a bias dataset with detector=2, but it is
    # masked by the presence of the same data ID in imported_g.
    self.assertCountEqual(_ref_uuids(from_chain), [g_bias_det2, g_flat_det2, r_flat_det2])

    # Collection searches work.
    g_wildcard = butler.query_all_datasets(
        "*g", where="detector=1 and instrument = 'Cam1'", find_first=False
    )
    self.assertEqual(_ref_uuids(g_wildcard), ["e15ab039-bc8b-4135-87c5-90902a7c0b22"])

    # An explicitly named collection that does not exist raises ...
    with self.assertRaises(MissingCollectionError):
        butler.query_all_datasets("nonexistent")
    # ... but a collection wildcard that matches nothing does not.
    nothing = butler.query_all_datasets("nonexistent*", find_first=False)
    self.assertEqual(nothing, [])

    # Dataset type searches work.
    b_types = butler.query_all_datasets(
        "*", name="b*", where="detector=1 and instrument = 'Cam1'", find_first=False
    )
    self.assertEqual(_ref_uuids(b_types), ["e15ab039-bc8b-4135-87c5-90902a7c0b22"])

    # Missing dataset types raise, whether named or given as a pattern.
    with self.assertRaises(MissingDatasetTypeError):
        butler.query_all_datasets("chain", name=["notfound", "flat"])
    with self.assertRaises(MissingDatasetTypeError):
        butler.query_all_datasets("chain", name="notfound*")

    # Limit of 3 lands at the boundary of a dataset type.
    # Limit of 4 is in the middle of a dataset type.
    for limit in (3, 4):
        with self.subTest(limit=limit):
            limited = butler.query_all_datasets("imported_g", limit=limit)
            self.assertEqual(len(limited), limit)
            # A negative limit returns the same number of rows and logs
            # a warning.
            with self.assertLogs(level="WARNING") as log:
                limited = butler.query_all_datasets("imported_g", limit=-limit)
                self.assertEqual(len(limited), limit)
            self.assertIn("requested limit", log.output[0])

    no_rows = butler.query_all_datasets("imported_g", limit=0)
    self.assertEqual(len(no_rows), 0)

    # 'where' constraints that don't apply to all dataset types follow the
    # same rules as query_datasets.
    g_band = butler.query_all_datasets(
        "*", where="detector = 2 and band = 'g' and instrument = 'Cam1'", find_first=False
    )
    self.assertCountEqual(
        _ref_uuids(g_band),
        [
            # bias does not have 'band'
            g_bias_det2,
            r_bias_det2,
            # flat does have 'band', and we filter based on it
            g_flat_det2,
        ],
    )

    # Default collections and data ID apply.
    butler.registry.defaults = RegistryDefaults(collections="imported_g")
    from_defaults = butler.query_all_datasets(where="detector = 2")
    self.assertCountEqual(_ref_uuids(from_defaults), [g_bias_det2, g_flat_det2])
def test_irrelevant_governor_constraints(self) -> None:
    """Check that a dataset query still returns its results when the
    caller constrains a governor dimension that the queried dataset type
    does not use.
    """
    butler = self.make_butler("base.yaml", "spatial.yaml")
    # Add a second instrument so that "Cam2" is a valid governor value.
    butler.registry.insertDimensionData("instrument", {"name": "Cam2"})

    # One dataset type keyed by detector, one keyed by tract.
    detector_type = DatasetType("a", {"detector"}, "StructuredDataDict", universe=butler.dimensions)
    tract_type = DatasetType("b", {"tract"}, "StructuredDataDict", universe=butler.dimensions)
    butler.registry.registerDatasetType(detector_type)
    butler.registry.registerDatasetType(tract_type)

    run = "run1"
    butler.collections.register(run)
    [detector_ref] = butler.registry.insertDatasets(
        detector_type, [{"instrument": "Cam1", "detector": 2}], run=run
    )
    [tract_ref] = butler.registry.insertDatasets(tract_type, [{"skymap": "SkyMap1", "tract": 1}], run=run)

    # Sanity check of the setup: each type yields exactly its one dataset.
    self.assertEqual(butler.query_datasets("a", collections=run), [detector_ref])
    self.assertEqual(butler.query_datasets("b", collections=run), [tract_ref])

    # Irrelevant constraint, case 1: the collection also holds a dataset
    # of the other type whose data ID agrees with the constraint.
    self.assertEqual(butler.query_datasets("a", collections=run, skymap="SkyMap1"), [detector_ref])

    # Irrelevant constraint, case 2: the collection also holds a dataset
    # of the other type whose data ID disagrees with the constraint.
    self.assertEqual(butler.query_datasets("b", collections=run, instrument="Cam2"), [tract_ref])
def test_inferred_primary_key(self) -> None:
    """Check that an unqualified primary-key field in a ``where``
    expression (here ``id``) is resolved against the dimension being
    queried.
    """
    butler = self.make_butler("base.yaml")
    # The bare "id" in the expression has no dimension prefix.
    via_expression = butler.query_dimension_records("detector", instrument="Cam1", where="id=2")
    # The same constraint spelled as an explicit data ID keyword.
    via_data_id = butler.query_dimension_records("detector", instrument="Cam1", detector=2)
    self.assertEqual(via_expression, via_data_id)
def test_glob_expression(self) -> None:
    """Exercise the GLOB() function in user query expressions."""
    butler = self.make_butler("base.yaml")

    # Each case is (string column, glob pattern, expected number of
    # detector records returned for instrument Cam1).
    cases = (
        ("full_name", "*", 4),
        ("full_name", "\\*", 0),
        ("full_name", "A*", 2),
        ("full_name", "A?", 2),
        ("full_name", "??", 4),
        ("full_name", "*a", 2),
        ("full_name", "A[ab]", 0),
        ("purpose", "*EN?E", 3),
        ("purpose", "\\*CIENC\\*", 0),
        ("full_name", "%", 0),
        ("full_name", "__", 0),
        ("full_name", "a", 0),
        ("full_name", "", 0),
    )

    for column, pattern, expected in cases:
        # Pattern written into the expression as a string literal.
        literal_matches = butler.query_dimension_records(
            "detector", instrument="Cam1", where=f"GLOB({column}, '{pattern}')", explain=False
        )
        self.assertEqual(len(literal_matches), expected)

        # Same pattern supplied through a bind parameter instead.
        bound_matches = butler.query_dimension_records(
            "detector",
            instrument="Cam1",
            where=f"GLOB({column}, :pattern)",
            explain=False,
            bind={"pattern": pattern},
        )
        self.assertEqual(len(bound_matches), expected)

    # GLOB can be applied to a dimension itself, not only to metadata.
    dimension_matches = butler.query_dimension_records(
        "detector", where="GLOB(instrument, '?a*1') AND GLOB(full_name, '*')"
    )
    self.assertEqual(len(dimension_matches), 4)

    # Error cases: the first argument must be a string column.
    with self.assertRaisesRegex(InvalidQueryError, "first argument must be a string column"):
        butler.query_dimension_records("detector", instrument="Cam1", where="GLOB(detector, '*')")

    # This fails at the parser level because the parser expects a string
    # literal as the second argument.
    with self.assertRaisesRegex(InvalidQueryError, "Failed to parse expression"):
        butler.query_dimension_records("detector", instrument="Cam1", where="GLOB(full_name, full_name)")
def test_dataset_id_queries(self) -> None:
    """Check ``where`` constraints on ``dataset_id``, given either as bind
    values or as ``UUID('...')`` literals in the expression.
    """
    butler = self.make_butler("base.yaml", "datasets.yaml")

    single_id = UUID("e15ab039-bc8b-4135-87c5-90902a7c0b22")
    several_ids = {
        UUID("87f3e68d-258d-41b7-8ea5-edf3557ccb30"),
        UUID("dc0ef017-dc94-4118-b431-d65b1ef89a5f"),
        UUID("e255067d-dcc5-4f39-9824-0baa5817d3e5"),
    }

    # Equality against a single bound UUID.
    found = butler.query_datasets(
        "bias",
        "imported_g",
        instrument="Cam1",
        where="dataset_id = :ID",
        bind={"ID": single_id},
    )
    self.assertEqual({ref.id for ref in found}, {single_id})

    # IN against a bound collection of UUIDs, with the column qualified
    # by the dataset type name.
    found = butler.query_datasets(
        "bias",
        "imported_r",
        instrument="Cam1",
        where="bias.dataset_id IN (:IDS)",
        bind={"IDS": several_ids},
    )
    self.assertEqual({ref.id for ref in found}, several_ids)

    # Equality against a UUID literal written in the expression.
    found = butler.query_datasets(
        "bias",
        "imported_g",
        instrument="Cam1",
        where="dataset_id = UUID('e15ab039-bc8b-4135-87c5-90902a7c0b22')",
    )
    self.assertEqual({ref.id for ref in found}, {single_id})

    # IN against UUID literals written in the expression.
    literal_in_clause = (
        "bias.dataset_id IN ("
        "UUID('87f3e68d-258d-41b7-8ea5-edf3557ccb30'), "
        "UUID('dc0ef017-dc94-4118-b431-d65b1ef89a5f'), "
        "UUID('e255067d-dcc5-4f39-9824-0baa5817d3e5')"
        ")"
    )
    found = butler.query_datasets(
        "bias",
        "imported_r",
        instrument="Cam1",
        where=literal_in_clause,
    )
    self.assertEqual({ref.id for ref in found}, several_ids)
def _get_exposure_ids_from_dimension_records(dimension_records: Iterable[DimensionRecord]) -> list[int]:
    """Collect the ``exposure`` data ID value of each dimension record.

    Parameters
    ----------
    dimension_records : `~collections.abc.Iterable` [ `DimensionRecord` ]
        Records whose ``dataId`` mapping has an ``"exposure"`` key.

    Returns
    -------
    exposure_ids : `list` [ `int` ]
        The exposure IDs, in the order the records were iterated.
    """
    exposure_ids = []
    for record in dimension_records:
        # Named ``exposure_id`` rather than ``id`` so the builtin ``id()``
        # is not shadowed inside this function.
        exposure_id = record.dataId["exposure"]
        # Fail immediately on a non-integer value; the isinstance check
        # also lets static type checkers treat the value as ``int``.
        assert isinstance(exposure_id, int)
        exposure_ids.append(exposure_id)
    return exposure_ids
def _ref_uuids(refs: list[DatasetRef]) -> list[str]:
    """Return ``str(ref.id)`` for every given ref, preserving input order."""
    return list(map(str, map(attrgetter("id"), refs)))