Coverage for python / lsst / daf / butler / queries / _query.py: 16%
211 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-28 08:36 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-28 08:36 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = ("Query", "QueryFactoryFunction")
32from collections.abc import Callable, Iterable, Mapping, Set
33from contextlib import AbstractContextManager
34from types import EllipsisType
35from typing import Any, TypeAlias, final
37import astropy.table
39from lsst.utils.iteration import ensure_iterable
41from .._dataset_type import DatasetType
42from .._exceptions import DimensionNameError, InvalidQueryError
43from .._storage_class import StorageClassFactory
44from ..dimensions import DataCoordinate, DataId, DataIdValue, DimensionGroup
45from ..registry import DatasetTypeError
46from ._base import QueryBase
47from ._data_coordinate_query_results import DataCoordinateQueryResults
48from ._dataset_query_results import DatasetRefQueryResults
49from ._dimension_record_query_results import DimensionRecordQueryResults
50from ._general_query_results import GeneralQueryResults
51from ._identifiers import IdentifierContext, interpret_identifier
52from .convert_args import convert_where_args
53from .driver import QueryDriver
54from .expression_factory import ExpressionFactory
55from .predicate_constraints_summary import PredicateConstraintsSummary
56from .result_specs import (
57 DataCoordinateResultSpec,
58 DatasetRefResultSpec,
59 DimensionRecordResultSpec,
60 GeneralResultSpec,
61)
62from .tree import (
63 ANY_DATASET,
64 DatasetFieldName,
65 DatasetFieldReference,
66 DatasetSearch,
67 DimensionFieldReference,
68 DimensionKeyReference,
69 Predicate,
70 QueryTree,
71 make_identity_query_tree,
72)
@final
class Query(QueryBase):
    """A method-chaining builder for butler queries.

    Parameters
    ----------
    driver : `~.queries.driver.QueryDriver`
        Implementation object that knows how to actually execute queries.
    tree : `~.queries.tree.QueryTree`, optional
        Description of the query as a tree of joins and column expressions.
        Defaults to the result of a call to
        `~.queries.tree.make_identity_query_tree`.

    Notes
    -----
    `Query` objects should never be constructed directly by users; use
    `Butler.query <lsst.daf.butler.Butler.query>` instead.

    A `Query` object represents the first stage of query construction, in
    which constraints and joins are defined (roughly corresponding to the
    WHERE and FROM clauses in SQL).  The various "results" objects represent
    the second (and final) stage, where the columns returned are specified and
    any sorting or integer slicing can be applied.  Result objects are
    obtained from the `data_ids`, `datasets`, and `dimension_records` methods.

    `Query` and query-result objects are always immutable (except for caching
    information fetched from the database or server), so modifier methods
    always return a new object without modifying the current one.
    """

    def __init__(self, driver: QueryDriver, tree: QueryTree | None = None):
        # __init__ defined here because there are multiple base classes and
        # not all define __init__ (and hence inherit object.__init__, which
        # just ignores its args).  Even if we just delegate to super(), it
        # seems less fragile to make it explicit here.
        if tree is None:
            tree = make_identity_query_tree(driver.universe)
        super().__init__(driver, tree)
        # If ``_allow_duplicate_overlaps`` is set to `True` then the query
        # will be allowed to generate non-distinct rows for spatial overlaps.
        # This is not a part of the public API for now; it is used by the
        # graph builder as an optimization.
        self._allow_duplicate_overlaps: bool = False
    @property
    def constraint_dataset_types(self) -> Set[str]:
        """The names of all dataset types joined into the query.

        The existence of datasets of these types constrains the data IDs of
        any type of result.  Fields for these dataset types are also usable in
        'where' expressions.
        """
        # Note that this includes only dataset type names, not `DatasetType`
        # instances; the `DatasetQueryResults` adapter returned by the
        # `datasets` method does include `DatasetType` instances, since it is
        # in a better position to track and respect any storage class override
        # specified.
        return self._tree.datasets.keys()
    @property
    def constraint_dimensions(self) -> DimensionGroup:
        """Dimensions currently present in the query, either directly or
        indirectly.

        This includes dimensions that are present in any joined subquery (such
        as a dataset search, materialization, or data ID upload) or `where`
        argument, as well as any required or implied dependency of those
        dimensions.
        """
        return self._tree.dimensions
    @property
    def expression_factory(self) -> ExpressionFactory:
        """A factory for column expressions using overloaded operators.
        (`~lsst.daf.butler.queries.expression_factory.ExpressionFactory`).

        Notes
        -----
        Typically this attribute will be assigned to a single-character local
        variable, and then its (dynamic) attributes can be used to obtain
        references to columns that can be included in a query::

            with butler.query() as query:
                x = query.expression_factory
                query = query.where(
                    x.instrument == "LSSTCam",
                    x.visit.day_obs > 20240701,
                    x.any(x.band == "u", x.band == "y"),
                )

        As shown above, the returned object also has an
        `~lsst.daf.butler.queries.expression_factory.ExpressionFactory.any`
        method to create combine expressions with logical OR (as well as
        `~lsst.daf.butler.queries.expression_factory.ExpressionFactory.not_`
        and
        `~lsst.daf.butler.queries.expression_factory.ExpressionFactory.all`,
        though the latter is rarely necessary since `where` already combines
        its arguments with AND).

        Proxies for fields associated with individual datasets but not
        dimension records (``dataset_id``, ``ingest_date``, ``run``,
        ``collection``, as well as ``timespan`` for
        `~lsst.daf.butler.CollectionType.CALIBRATION` collection searches) can
        be obtained with dict-like access instead::

            with butler.query() as query:
                query = query.order_by(x["raw"].ingest_date)

        Expression proxy objects that correspond to scalar columns overload
        the standard comparison operators (``==``, ``!=``, ``<``, ``>``,
        ``<=``, ``>=``) and provide
        `~lsst.daf.butler.queries.expression_factory.ScalarExpressionProxy.in_range`,
        `~lsst.daf.butler.queries.expression_factory.ScalarExpressionProxy.in_iterable`, and
        `~lsst.daf.butler.queries.expression_factory.ScalarExpressionProxy.in_query`
        methods for membership tests.  For ``order_by`` contexts, they also have a
        `~lsst.daf.butler.queries.expression_factory.ScalarExpressionProxy.desc`
        property to indicate that the sort order for that expression should be
        reversed.

        Proxy objects for
        `region <lsst.daf.butler.queries.expression_factory.RegionProxy>` and
        `timespan <lsst.daf.butler.queries.expression_factory.TimespanProxy>`
        fields have an ``overlaps`` method, and timespans also have
        `~lsst.daf.butler.queries.expression_factory.TimespanProxy.begin` and
        `~lsst.daf.butler.queries.expression_factory.TimespanProxy.end`
        properties to access scalar expression proxies for the bounds.

        All proxy objects also have an
        `~lsst.daf.butler.queries.expression_factory.ExpressionProxy.is_null`
        property.

        Literal values can be created by calling
        `ExpressionFactory.literal <lsst.daf.butler.queries.expression_factory.ExpressionFactory.literal>`,
        but can almost always be created implicitly via overloaded operators
        instead.
        """  # noqa: W505, long docstrings
        return ExpressionFactory(self._driver.universe)
214 def data_ids(
215 self, dimensions: DimensionGroup | Iterable[str] | str | None = None
216 ) -> DataCoordinateQueryResults:
217 """Return a result object that is a `~lsst.daf.butler.DataCoordinate`
218 iterable.
220 Parameters
221 ----------
222 dimensions : `~lsst.daf.butler.DimensionGroup`, `str`, or \
223 `~collections.abc.Iterable` [`str`], optional
224 The dimensions of the data IDs to yield, as either
225 `~lsst.daf.butler.DimensionGroup` instances or `str` names. Will
226 be automatically expanded to a complete
227 `~lsst.daf.butler.DimensionGroup`. These dimensions do not need to
228 match the query's current dimensions. Default is
229 `constraint_dimensions`.
231 Returns
232 -------
233 data_ids : `~lsst.daf.butler.queries.DataCoordinateQueryResults`
234 Data IDs matching the given query parameters. These are guaranteed
235 to identify all dimensions (``DataCoordinate.hasFull`` returns
236 `True`), but will not contain `~lsst.daf.butler.DimensionRecord`
237 objects (``DataCoordinate.hasRecords`` returns `False`). Call
238 `~DataCoordinateQueryResults.with_dimension_records` on the
239 returned object to include dimension records as well.
240 """
241 tree = self._tree
242 if dimensions is None:
243 dimensions = self._tree.dimensions
244 else:
245 dimensions = self._driver.universe.conform(dimensions)
246 if not dimensions <= self._tree.dimensions:
247 tree = tree.join_dimensions(dimensions)
248 result_spec = DataCoordinateResultSpec(
249 dimensions=dimensions,
250 include_dimension_records=False,
251 allow_duplicate_overlaps=self._allow_duplicate_overlaps,
252 )
253 return DataCoordinateQueryResults(self._driver, tree, result_spec)
255 def datasets(
256 self,
257 dataset_type: str | DatasetType,
258 collections: str | Iterable[str] | None = None,
259 *,
260 find_first: bool = True,
261 ) -> DatasetRefQueryResults:
262 """Return a result object that is a `~lsst.daf.butler.DatasetRef`
263 iterable.
265 Parameters
266 ----------
267 dataset_type : `str` or `~lsst.daf.butler.DatasetType`
268 The dataset type to search for.
269 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
270 The collection or collections to search, in order. If not provided
271 or `None`, and the dataset has not already been joined into the
272 query, the default collection search path for this butler is used.
273 find_first : `bool`, optional
274 If `True` (default), for each result data ID, only yield one
275 `~lsst.daf.butler.DatasetRef` of each
276 `~lsst.daf.butler.DatasetType`, from the first collection in
277 which a dataset of that dataset type appears (according to the
278 order of ``collections`` passed in). If `True`, ``collections``
279 must not be ``...``.
281 Returns
282 -------
283 refs : `lsst.daf.butler.queries.DatasetRefQueryResults`
284 Dataset references matching the given query criteria. Nested data
285 IDs are guaranteed to include values for all implied dimensions
286 (i.e. ``DataCoordinate.hasFull`` will return `True`), but will not
287 include dimension records (``DataCoordinate.hasRecords`` will be
288 `False`) unless
289 `~.queries.DatasetRefQueryResults.with_dimension_records` is
290 called on the result object (which returns a new one).
292 Raises
293 ------
294 lsst.daf.butler.registry.DatasetTypeExpressionError
295 Raised when the ``dataset_type`` expression is invalid.
296 lsst.daf.butler.registry.NoDefaultCollectionError
297 Raised when ``collections`` is `None` and default butler
298 collections are not defined.
299 TypeError
300 Raised when the arguments are incompatible, such as when a
301 collection wildcard is passed when ``find_first`` is `True`
302 """
303 dataset_type_name, storage_class_name, query = self._join_dataset_search_impl(
304 dataset_type, collections
305 )
306 dataset_search = query._tree.datasets[dataset_type_name]
307 spec = DatasetRefResultSpec.model_construct(
308 dataset_type_name=dataset_type_name,
309 dimensions=dataset_search.dimensions,
310 storage_class_name=storage_class_name,
311 include_dimension_records=False,
312 find_first=find_first,
313 allow_duplicate_overlaps=self._allow_duplicate_overlaps,
314 )
315 return DatasetRefQueryResults(self._driver, tree=query._tree, spec=spec)
317 def dimension_records(self, element: str) -> DimensionRecordQueryResults:
318 """Return a result object that is a `~lsst.daf.butler.DimensionRecord`
319 iterable.
321 Parameters
322 ----------
323 element : `str`
324 The name of a dimension element to obtain records for.
326 Returns
327 -------
328 records : `lsst.daf.butler.queries.DimensionRecordQueryResults`
329 Data IDs matching the given query parameters.
330 """
331 if element not in self._driver.universe:
332 # Prefer an explicit exception over a KeyError below.
333 raise DimensionNameError(
334 f"No such dimension '{element}', available dimensions: " + str(self._driver.universe.elements)
335 )
336 tree = self._tree
337 if element not in tree.dimensions.elements:
338 tree = tree.join_dimensions(self._driver.universe[element].minimal_group)
339 result_spec = DimensionRecordResultSpec(
340 element=self._driver.universe[element], allow_duplicate_overlaps=self._allow_duplicate_overlaps
341 )
342 return DimensionRecordQueryResults(self._driver, tree, result_spec)
    def general(
        self,
        dimensions: DimensionGroup | Iterable[str],
        *names: str,
        dimension_fields: Mapping[str, Set[str]] | None = None,
        dataset_fields: Mapping[str, Set[DatasetFieldName] | EllipsisType] | None = None,
        find_first: bool | None = None,
    ) -> GeneralQueryResults:
        """Execute query returning general result.

        **This is an experimental interface and may change at any time.**

        Parameters
        ----------
        dimensions : `~lsst.daf.butler.DimensionGroup` or \
                `~collections.abc.Iterable` [ `str` ]
            The dimensions that span all fields returned by this query.
        *names : `str`
            Names of dimensions fields (in "dimension.field" format), dataset
            fields (in "dataset_type.field" format) to include in this query.
        dimension_fields : `~collections.abc.Mapping` [`str`, \
                `~collections.abc.Set` [`str`]], optional
            Dimension record fields included in this query, keyed by dimension
            element name.
        dataset_fields : `~collections.abc.Mapping` [`str`, \
                `~collections.abc.Set` | ``...`` ], optional
            Dataset fields included in this query, the key in the mapping is
            dataset type name.  Ellipsis (``...``) can be used for value
            to include all dataset fields needed to extract
            `~lsst.daf.butler.DatasetRef` instances later.
        find_first : `bool`, optional
            Whether this query requires find-first resolution for a dataset.
            This is ignored and can be omitted if the query has no dataset
            fields.  It must be explicitly set to `False` if there are
            multiple dataset types with fields, or if any dataset's
            ``collections`` or ``timespan`` fields are included in the
            results.

        Returns
        -------
        result : `~lsst.daf.butler.queries.GeneralQueryResults`
            Query result that can be iterated over.

        Notes
        -----
        The dimensions of the returned query are automatically expanded to
        include those associated with all dimension and dataset fields; the
        ``dimensions`` argument is just the minimal dimensions to return.
        """
        if dimension_fields is None:
            dimension_fields = {}
        if dataset_fields is None:
            dataset_fields = {}
        # Processing fields from mapping args, processing the special `...`
        # wildcard and dropping keys with empty values.
        dataset_fields_dict: dict[str, set[DatasetFieldName]] = {}
        for dataset_type_name, fields_for_dataset_type in dataset_fields.items():
            if fields_for_dataset_type is ...:
                new_fields_for_dataset_type: set[DatasetFieldName] = {
                    "run",
                    "dataset_id",
                }  # all we need for DatasetRefs.
            else:
                new_fields_for_dataset_type = set(fields_for_dataset_type)
            if new_fields_for_dataset_type:
                dataset_fields_dict[dataset_type_name] = new_fields_for_dataset_type
        dimension_fields_dict = {
            element_name: new_fields_for_element
            for element_name, fields_for_element in dimension_fields.items()
            if (new_fields_for_element := set(fields_for_element))
        }
        # Parse all names passed as positional arguments, and start to
        # accumulate additional dimension names we'll need in the results.
        dimensions = self._driver.universe.conform(dimensions)
        context = IdentifierContext(dimensions, set(self._tree.datasets))
        extra_dimension_names: set[str] = set()
        for name in names:
            identifier = interpret_identifier(context, name)
            match identifier:
                case DimensionKeyReference(dimension=dimension):
                    # Could be because someone asked for the key field.
                    extra_dimension_names.add(dimension.name)
                case DimensionFieldReference(element=element, field=field):
                    dimension_fields_dict.setdefault(element.name, set()).add(field)
                case DatasetFieldReference(dataset_type=dataset_type, field=dataset_field):
                    if dataset_type is ANY_DATASET:
                        raise InvalidQueryError("Dataset wildcard fields are not supported by Query.general.")
                    dataset_fields_dict.setdefault(dataset_type, set()).add(dataset_field)
                case _:
                    raise TypeError(f"Unexpected type of identifier ({name}): {identifier}")
        # Add more dimension names from the field mappings (including those
        # we just populated from args).  Also check that the dataset fields
        # are consistent with find_first.
        for element_name, fields in dimension_fields_dict.items():
            extra_dimension_names.update(self._driver.universe[element_name].minimal_group.names)
        for dataset_type_name, fields_for_dataset_type in dataset_fields_dict.items():
            # ``collections`` and ``timespan`` values depend on which
            # collection a dataset was found in, which is ambiguous unless
            # find-first resolution is explicitly disabled.
            if "collections" in fields_for_dataset_type and find_first is not False:
                raise InvalidQueryError(
                    f"find_first=False must be passed explicitly if {dataset_type_name}.collections "
                    "is included in query results."
                )
            if "timespan" in fields_for_dataset_type and find_first is not False:
                raise InvalidQueryError(
                    f"find_first=False must be passed explicitly if {dataset_type_name}.timespan "
                    "is included in query results."
                )
            try:
                dataset_search = self._tree.datasets[dataset_type_name]
            except KeyError:
                raise InvalidQueryError(
                    f"A search for dataset type {dataset_type_name!r} must be explicitly joined into the "
                    "query before including its fields in query results."
                ) from None
            extra_dimension_names.update(dataset_search.dimensions.names)
        if find_first is None:
            if dataset_fields_dict:
                raise InvalidQueryError(
                    "find_first must be passed if dataset fields are included in query results."
                )
            else:
                find_first = False
        if find_first and len(dataset_fields_dict) != 1:
            raise InvalidQueryError(
                "find_first=True is not valid unless exactly one dataset type's fields are requested."
            )
        # Combine extra dimensions with the original ones.
        dimensions = self._driver.universe.conform(dimensions.names | extra_dimension_names)
        # Merge missing dimensions into the tree.
        tree = self._tree
        if not dimensions <= tree.dimensions:
            tree = tree.join_dimensions(dimensions)
        result_spec = GeneralResultSpec(
            dimensions=dimensions,
            dimension_fields=dimension_fields_dict,
            dataset_fields=dataset_fields_dict,
            find_first=find_first,
            allow_duplicate_overlaps=self._allow_duplicate_overlaps,
        )
        return GeneralQueryResults(self._driver, tree=tree, spec=result_spec)
483 def materialize(
484 self,
485 *,
486 dimensions: Iterable[str] | DimensionGroup | None = None,
487 datasets: Iterable[str] | None = None,
488 ) -> Query:
489 """Execute the query, save its results to a temporary location, and
490 return a new query that represents fetching or joining against those
491 saved results.
493 Parameters
494 ----------
495 dimensions : `~collections.abc.Iterable` [ `str` ] or \
496 `~lsst.daf.butler.DimensionGroup`, optional
497 Dimensions to include in the temporary results. Default is to
498 include all dimensions in the query.
499 datasets : `~collections.abc.Iterable` [ `str` ], optional
500 Names of dataset types that should be included in the new query;
501 default is to include `constraint_dataset_types`.
503 Returns
504 -------
505 query : `Query`
506 A new query object whose that represents the materialized rows.
508 Notes
509 -----
510 Only dimension key columns and (at the discretion of the
511 implementation) certain dataset columns are actually materialized,
512 since at this stage we do not know which dataset or dimension record
513 fields are actually needed in result rows, and these can be joined back
514 in on the materialized dimension keys. But all constraints on those
515 dimension keys (including dataset existence) are applied to the
516 materialized rows.
517 """
518 if datasets is None:
519 datasets = frozenset(self.constraint_dataset_types)
520 else:
521 datasets = frozenset(datasets)
522 if not (datasets <= self.constraint_dataset_types):
523 raise InvalidQueryError(
524 f"Dataset(s) {datasets - self.constraint_dataset_types} are present in the query."
525 )
526 if dimensions is None:
527 dimensions = self._tree.dimensions
528 else:
529 dimensions = self._driver.universe.conform(dimensions)
530 key = self._driver.materialize(
531 self._tree, dimensions, datasets, allow_duplicate_overlaps=self._allow_duplicate_overlaps
532 )
533 tree = make_identity_query_tree(self._driver.universe).join_materialization(
534 key, dimensions=dimensions
535 )
536 for dataset_type_name in datasets:
537 dataset_search = self._tree.datasets[dataset_type_name]
538 if not (dataset_search.dimensions <= tree.dimensions):
539 raise InvalidQueryError(
540 f"Materialization-backed query has dimensions {tree.dimensions}, which do not "
541 f"cover the dimensions {dataset_search.dimensions} of dataset {dataset_type_name!r}. "
542 "Expand the dimensions or drop this dataset type in the arguments to materialize to "
543 "avoid this error."
544 )
545 tree = tree.join_dataset(dataset_type_name, dataset_search)
546 return Query(self._driver, tree)
548 def join_dataset_search(
549 self,
550 dataset_type: str | DatasetType,
551 collections: Iterable[str] | None = None,
552 ) -> Query:
553 """Return a new query with a search for a dataset joined in.
555 Parameters
556 ----------
557 dataset_type : `str` or `~lsst.daf.butler.DatasetType`
558 Dataset type or name. May not refer to a dataset component.
559 collections : `~collections.abc.Iterable` [ `str` ], optional
560 Iterable of collections to search. Order is preserved, but will
561 not matter if the dataset search is only used as a constraint on
562 dimensions or if ``find_first=False`` when requesting results. If
563 not present or `None`, the default collection search path will be
564 used.
566 Returns
567 -------
568 query : `Query`
569 A new query object with dataset columns available and rows
570 restricted to those consistent with the found data IDs.
572 Raises
573 ------
574 DatasetTypeError
575 Raised if given dataset type is inconsistent with the registered
576 dataset type.
577 MissingDatasetTypeError
578 Raised if the dataset type has not been registered and only a
579 `str` dataset type name was given.
581 Notes
582 -----
583 This method may require communication with the server unless the
584 dataset type and collections have already been referenced by the same
585 query context.
586 """
587 _, _, query = self._join_dataset_search_impl(
588 dataset_type, collections, allow_storage_class_overrides=False
589 )
590 return query
592 def join_data_coordinates(self, iterable: Iterable[DataCoordinate]) -> Query:
593 """Return a new query that joins in an explicit iterable of data IDs.
595 Parameters
596 ----------
597 iterable : `~collections.abc.Iterable` \
598 [`~lsst.daf.butler.DataCoordinate`]
599 Iterable of `~lsst.daf.butler.DataCoordinate`. All items must have
600 the same dimensions. Must have at least one item.
602 Returns
603 -------
604 query : `Query`
605 A new query object with the data IDs joined in.
606 """
607 rows: set[tuple[DataIdValue, ...]] = set()
608 dimensions: DimensionGroup | None = None
609 for data_coordinate in iterable:
610 if dimensions is None:
611 dimensions = data_coordinate.dimensions
612 elif dimensions != data_coordinate.dimensions:
613 raise InvalidQueryError(
614 f"Inconsistent dimensions: {dimensions} != {data_coordinate.dimensions}."
615 )
616 rows.add(data_coordinate.required_values)
617 if dimensions is None:
618 raise InvalidQueryError("Cannot upload an empty data coordinate set.")
619 key = self._driver.upload_data_coordinates(dimensions, rows)
620 return Query(
621 tree=self._tree.join_data_coordinate_upload(dimensions=dimensions, key=key),
622 driver=self._driver,
623 )
    def join_data_coordinate_table(self, table: astropy.table.Table) -> Query:
        """Return a new query that joins in an explicit table of data IDs.

        Parameters
        ----------
        table : `astropy.table.Table`
            A table of data IDs to join.  Columns must be dimension names, and
            columns for dimensions whose values are implied by others are
            ignored.  If a column for a required dimension is missing but that
            dimension is fully constrained to a literal by a previous `where`
            call, a constant-valued column will be added.

        Returns
        -------
        query : `Query`
            A new query object with the data IDs joined in.
        """
        if not len(table):
            raise InvalidQueryError("Cannot upload an empty data coordinate set.")
        column_names = set(table.colnames)
        dimensions = self._driver.universe.conform(column_names)
        # To avoid numpy scalar types that will upset SQLAlchemy, we turn the
        # columns we care about into lists of regular Python scalars.  We do
        # this in dimensions.required order so we can zip the values of this
        # dict later to make data ID 'required_values' tuples.
        column_lists = {d: table[d].data.tolist() if d in column_names else None for d in dimensions.required}
        if not column_names.issuperset(dimensions.required):
            # If columns are missing, see if they're fixed by a previous
            # `where` call or equivalent.
            predicate_summary = PredicateConstraintsSummary(self._tree.predicate)
            missing = dimensions.required - column_names
            provided_by_predicate = predicate_summary.constraint_data_id.keys() & missing
            missing -= provided_by_predicate
            if missing:
                raise InvalidQueryError(f"Data coordinate table is missing required dimension(s) {missing}.")
            if provided_by_predicate:
                # Fill in each predicate-fixed dimension as a constant column.
                for k in provided_by_predicate:
                    column_lists[k] = [predicate_summary.constraint_data_id[k]] * len(table)
        key = self._driver.upload_data_coordinates(dimensions, zip(*column_lists.values(), strict=True))
        return Query(
            tree=self._tree.join_data_coordinate_upload(dimensions=dimensions, key=key),
            driver=self._driver,
        )
669 def join_dimensions(self, dimensions: Iterable[str] | DimensionGroup) -> Query:
670 """Return a new query that joins the logical tables for additional
671 dimensions.
673 Parameters
674 ----------
675 dimensions : `~collections.abc.Iterable` [ `str` ] or \
676 `~lsst.daf.butler.DimensionGroup`
677 Names of dimensions to join in.
679 Returns
680 -------
681 query : `Query`
682 A new query object with the dimensions joined in.
684 Notes
685 -----
686 Dimensions are automatically joined in whenever needed, so this method
687 should rarely need to be called directly.
688 """
689 dimensions = self._driver.universe.conform(dimensions)
690 return Query(tree=self._tree.join_dimensions(dimensions), driver=self._driver)
692 def where(
693 self,
694 *args: str | Predicate | DataId,
695 bind: Mapping[str, Any] | None = None,
696 **kwargs: Any,
697 ) -> Query:
698 """Return a query with a boolean-expression filter on its rows.
700 Parameters
701 ----------
702 *args
703 Constraints to apply, combined with logical AND. Arguments may be
704 `str` expressions to parse,
705 `~lsst.daf.butler.queries.tree.Predicate` objects (these are
706 typically constructed via `expression_factory`) or data IDs.
707 bind : `~collections.abc.Mapping`
708 Mapping from string identifier appearing in a string expression to
709 a literal value that should be substituted for it. This is
710 recommended instead of embedding literals directly into the
711 expression, especially for strings, timespans, or other types where
712 quoting or formatting is nontrivial.
713 **kwargs
714 Data ID key value pairs that extend and override any present in
715 ``*args``.
717 Returns
718 -------
719 query : `Query`
720 A new query object with the given row filters (as well as any
721 already present in ``self``). All row filters are combined with
722 logical AND.
724 Notes
725 -----
726 Expressions referring to dimensions or dimension elements are resolved
727 automatically. References to dataset fields (see `expression_factory`
728 for the distinction) cannot be resolved by default; they must either be
729 preceded by a call to `join_dataset_search` or must be passed to
730 `DatasetRefQueryResults.where <lsst.daf.butler.queries.DatasetRefQueryResults.where>`
731 instead.
733 Data ID values are not checked for consistency; they are extracted from
734 ``args`` and then ``kwargs`` and combined, with later values overriding
735 earlier ones.
736 """ # noqa: W505, long docstrings
737 return Query(
738 tree=self._tree.where(
739 convert_where_args(
740 self.constraint_dimensions,
741 self.constraint_dataset_types,
742 *args,
743 bind=bind,
744 **kwargs,
745 )
746 ),
747 driver=self._driver,
748 )
750 def _skip_governor_validation(self) -> Query:
751 tree = self._tree.model_copy(update={"validateGovernorConstraints": False})
752 return Query(tree=tree, driver=self._driver)
754 def _join_dataset_search_impl(
755 self,
756 dataset_type: str | DatasetType,
757 collections: Iterable[str] | None = None,
758 allow_storage_class_overrides: bool = True,
759 ) -> tuple[str, str, Query]:
760 """Implement `join_dataset_search`, and also return the dataset type
761 name and storage class, in addition to the modified Query.
762 """
763 # In this method we need the dimensions of the dataset type, but we
764 # might not need the storage class, since the dataset may only be used
765 # as an existence constraint. It depends on whether
766 # `join_dataset_search` or `datasets` is calling this method.
767 dimensions: DimensionGroup | None = None
768 storage_class_name: str | None = None
769 # Handle DatasetType vs. str arg.
770 if isinstance(dataset_type, DatasetType):
771 dataset_type_name = dataset_type.name
772 dimensions = dataset_type.dimensions
773 storage_class_name = dataset_type.storageClass_name
774 elif isinstance(dataset_type, str):
775 dataset_type_name = dataset_type
776 else:
777 raise TypeError(f"Invalid dataset type argument {dataset_type!r}.")
778 # See if this dataset has already been joined into the query.
779 if existing_search := self._tree.datasets.get(dataset_type_name):
780 if collections is None:
781 collections = existing_search.collections
782 else:
783 collections = tuple(ensure_iterable(collections))
784 if collections != existing_search.collections:
785 raise InvalidQueryError(
786 f"Dataset type {dataset_type_name!r} was already joined into this "
787 "query with a different collection search path (previously "
788 f"[{', '.join(existing_search.collections)}], now [{', '.join(collections)}])."
789 )
790 if dimensions is None:
791 dimensions = existing_search.dimensions
792 else:
793 if collections is None:
794 collections = self._driver.get_default_collections()
795 collections = tuple(ensure_iterable(collections))
796 # Look up the data repository definition of the dataset type to check
797 # for consistency, or get dimensions and storage class if we don't have
798 # them.
799 resolved_dataset_type = self._driver.get_dataset_type(dataset_type_name)
800 resolved_dimensions = resolved_dataset_type.dimensions
801 if dimensions is not None and dimensions != resolved_dimensions:
802 raise DatasetTypeError(
803 f"Given dimensions {dimensions} for dataset type {dataset_type_name!r} do not match the "
804 f"registered dimensions {resolved_dimensions}."
805 )
806 if storage_class_name is not None:
807 if storage_class_name != resolved_dataset_type.storageClass_name:
808 if not allow_storage_class_overrides:
809 raise InvalidQueryError(
810 f"Storage class {storage_class_name!r} for dataset type {dataset_type!r} differs "
811 f"from repository definition {resolved_dataset_type.storageClass_name!r}, but "
812 "join_dataset_search does not are about storage classes and cannot record this "
813 "override. Pass the override to `Query.datasets` instead."
814 )
815 if not (
816 StorageClassFactory()
817 .getStorageClass(storage_class_name)
818 .can_convert(resolved_dataset_type.storageClass)
819 ):
820 raise DatasetTypeError(
821 f"Given storage class {storage_class_name!r} for {dataset_type_name!r} is not "
822 f"compatible with repository storage class {resolved_dataset_type.storageClass_name}."
823 )
824 else:
825 storage_class_name = resolved_dataset_type.storageClass_name
826 dataset_search = DatasetSearch.model_construct(
827 collections=collections,
828 dimensions=resolved_dimensions,
829 )
830 return (
831 dataset_type_name,
832 storage_class_name,
833 Query(self._driver, self._tree.join_dataset(dataset_type_name, dataset_search)),
834 )
QueryFactoryFunction: TypeAlias = Callable[[], AbstractContextManager[Query]]
"""Type signature for a function returning a context manager that sets up a
`Query` object.  (That is, a function equivalent to ``Butler.query()``.)
"""