Coverage for python/lsst/daf/butler/queries/_query.py: 25%
123 statements
coverage.py v7.4.4, created at 2024-04-04 02:55 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = ("Query",)
32from collections.abc import Iterable, Mapping, Set
33from typing import Any, final
35from lsst.utils.iteration import ensure_iterable
37from .._dataset_type import DatasetType
38from .._storage_class import StorageClassFactory
39from ..dimensions import DataCoordinate, DataId, DataIdValue, DimensionGroup
40from ..registry import DatasetTypeError
41from ._base import QueryBase
42from ._data_coordinate_query_results import DataCoordinateQueryResults
43from ._dataset_query_results import DatasetRefQueryResults
44from ._dimension_record_query_results import DimensionRecordQueryResults
45from .convert_args import convert_where_args
46from .driver import QueryDriver
47from .expression_factory import ExpressionFactory
48from .result_specs import DataCoordinateResultSpec, DatasetRefResultSpec, DimensionRecordResultSpec
49from .tree import DatasetSearch, InvalidQueryError, Predicate, QueryTree, make_identity_query_tree
52@final
53class Query(QueryBase):
54 """A method-chaining builder for butler queries.
56 Parameters
57 ----------
58 driver : `QueryDriver`
59 Implementation object that knows how to actually execute queries.
60 tree : `QueryTree`
61 Description of the query as a tree of joins and column expressions. The
62 instance returned directly by the `Butler._query` entry point should be
63 constructed via `make_identity_query_tree`.
65 Notes
66 -----
67 `Query` objects should never be constructed directly by users; use
68 `Butler._query` instead.
70 A `Query` object represents the first stage of query construction, in which
71 constraints and joins are defined (roughly corresponding to the WHERE and
72 FROM clauses in SQL). The various "results" objects represent the second
73 (and final) stage, where the columns returned are specified and any sorting
74 or integer slicing can be applied. Result objects are obtained from the
75 `data_ids`, `datasets`, and `dimension_records` methods.
77 `Query` and query-result objects are always immutable (except for caching
78 information fetched from the database or server), so modifier methods
79 always return a new object without modifying the current one.
80 """
82 def __init__(self, driver: QueryDriver, tree: QueryTree):
83 # __init__ defined here because there are multiple base classes and
84 # not all define __init__ (and hence inherit object.__init__, which
85 # just ignores its args). Even if we just delegate to super(), it
86 # seems less fragile to make it explicit here.
87 super().__init__(driver, tree)
89 @property
90 def constraint_dataset_types(self) -> Set[str]:
91 """The names of all dataset types joined into the query.
93 The existence of datasets of these types constrains the data IDs of any
94 type of result. Fields for these dataset types are also usable in
95 'where' expressions.
96 """
97 # Note that this includes only dataset type names, not `DatasetType`
98 # instances; the `DatasetQueryResults` adapter returned by the
99 # `datasets` method does include `DatasetType` instances, since it is
100 # in a better position to track and respect any storage class override
101 # specified.
102 return self._tree.datasets.keys()
104 @property
105 def constraint_dimensions(self) -> DimensionGroup:
106 """Dimensions currently present in the query, either directly or
107 indirectly.
109 This includes dimensions that are present in any joined subquery (such
110 as a dataset search, materialization, or data ID upload) or `where`
111 argument, as well as any required or implied dependency of those
112 dimensions.
113 """
114 return self._tree.dimensions
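# Example (illustrative sketch, not from the original source): inspecting the
# two properties above after joining a dataset search; `butler`, the "raw"
# dataset type, and the collection name are assumed to exist in the repository.
with butler._query() as query:
    query = query.join_dataset_search("raw", collections=["LSSTCam/raw/all"])
    print(query.constraint_dataset_types)  # set-like view containing "raw"
    print(query.constraint_dimensions)     # dimensions implied by the join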
116 @property
117 def expression_factory(self) -> ExpressionFactory:
118 """A factory for column expressions using overloaded operators.
120 Notes
121 -----
122 Typically this attribute will be assigned to a single-character local
123 variable, and then its (dynamic) attributes can be used to obtain
124 references to columns that can be included in a query::
126 with butler._query() as query:
127 x = query.expression_factory
128 query = query.where(
129 x.instrument == "LSSTCam",
130 x.visit.day_obs > 20240701,
131 x.any(x.band == 'u', x.band == 'y'),
132 )
134 As shown above, the returned object also has an `any` method to
135 combine expressions with logical OR (as well as `not_` and `all`,
136 though the latter is rarely necessary since `where` already combines
137 its arguments with AND).
139 Proxies for fields associated with dataset types (``dataset_id``,
140 ``ingest_date``, ``run``, ``collection``, as well as ``timespan`` for
141 `~CollectionType.CALIBRATION` collection searches) can be obtained with
142 dict-like access instead::
144 with butler._query() as query:
145 query = query.order_by(x["raw"].ingest_date)
147 Expression proxy objects that correspond to scalar columns overload the
148 standard comparison operators (``==``, ``!=``, ``<``, ``>``, ``<=``,
149 ``>=``) and provide `~ScalarExpressionProxy.in_range`,
150 `~ScalarExpressionProxy.in_iterable`, and
151 `~ScalarExpressionProxy.in_query` methods for membership tests. For
152 `order_by` contexts, they also have a `~ScalarExpressionProxy.desc`
153 property to indicate that the sort order for that expression should be
154 reversed.
156 Proxy objects for region and timespan fields have an `overlaps` method,
157 and timespans also have `~TimespanProxy.begin` and `~TimespanProxy.end`
158 properties to access scalar expression proxies for the bounds.
160 All proxy objects also have a `~ExpressionProxy.is_null` property.
162 Literal values can be created by calling `ExpressionFactory.literal`,
163 but can almost always be created implicitly via overloaded operators
164 instead.
165 """
166 return ExpressionFactory(self._driver.universe)
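# Example (illustrative sketch, not from the original source): the proxy
# helpers named above; `butler` is assumed to exist, and the exact argument
# forms of `in_range` and `in_iterable` are assumptions based on the docstring.
with butler._query() as query:
    x = query.expression_factory
    query = query.where(
        x.detector.in_range(0, 9),
        x.band.in_iterable(["g", "r", "i"]),
        x.not_(x.visit.day_obs.is_null),
    )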
168 def data_ids(
169 self, dimensions: DimensionGroup | Iterable[str] | str | None = None
170 ) -> DataCoordinateQueryResults:
171 """Return a result object that is a `DataCoordinate` iterable.
173 Parameters
174 ----------
175 dimensions : `DimensionGroup`, `str`, or \
176 `~collections.abc.Iterable` [`str`], optional
177 The dimensions of the data IDs to yield, as either `DimensionGroup`
178 instances or `str` names. Will be automatically expanded to a
179 complete `DimensionGroup`. These dimensions do not need to match
180 the query's current `dimensions`. Default is
181 `constraint_dimensions`.
183 Returns
184 -------
185 data_ids : `DataCoordinateQueryResults`
186 Data IDs matching the given query parameters. These are guaranteed
187 to identify all dimensions (`DataCoordinate.hasFull` returns
188 `True`), but will not contain `DimensionRecord` objects
189 (`DataCoordinate.hasRecords` returns `False`). Call
190 `~DataCoordinateQueryResults.with_dimension_records` on the
191 returned object to include dimension records as well.
192 """
193 tree = self._tree
194 if dimensions is None:
195 dimensions = self._tree.dimensions
196 else:
197 dimensions = self._driver.universe.conform(dimensions)
198 if not dimensions <= self._tree.dimensions:
199 tree = tree.join_dimensions(dimensions)
200 result_spec = DataCoordinateResultSpec(dimensions=dimensions, include_dimension_records=False)
201 return DataCoordinateQueryResults(self._driver, tree, result_spec)
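# Example (illustrative sketch, not from the original source): requesting data
# IDs for an explicit dimension group; `butler` and the instrument name are
# assumptions.
with butler._query() as query:
    data_ids = query.where(instrument="LSSTCam").data_ids(["visit", "detector"])
    for data_id in data_ids.with_dimension_records():
        print(data_id["visit"], data_id["detector"])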
203 def datasets(
204 self,
205 dataset_type: str | DatasetType,
206 collections: str | Iterable[str] | None = None,
207 *,
208 find_first: bool = True,
209 ) -> DatasetRefQueryResults:
210 """Return a result object that is a `DatasetRef` iterable.
212 Parameters
213 ----------
214 dataset_type : `str` or `DatasetType`
215 The dataset type to search for.
216 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
217 The collection or collections to search, in order. If not provided
218 or `None`, and the dataset has not already been joined into the
219 query, the default collection search path for this butler is used.
220 find_first : `bool`, optional
221 If `True` (default), for each result data ID, only yield one
222 `DatasetRef` of each `DatasetType`, from the first collection in
223 which a dataset of that dataset type appears (according to the
224 order of ``collections`` passed in). If `True`, ``collections``
225 must not be ``...``.
227 Returns
228 -------
229 refs : `.queries.DatasetRefQueryResults`
230 Dataset references matching the given query criteria. Nested data
231 IDs are guaranteed to include values for all implied dimensions
232 (i.e. `DataCoordinate.hasFull` will return `True`), but will not
233 include dimension records (`DataCoordinate.hasRecords` will be
234 `False`) unless
235 `~.queries.DatasetRefQueryResults.with_dimension_records` is
236 called on the result object (which returns a new one).
238 Raises
239 ------
240 lsst.daf.butler.registry.DatasetTypeExpressionError
241 Raised when the ``dataset_type`` expression is invalid.
242 lsst.daf.butler.registry.NoDefaultCollectionError
243 Raised when ``collections`` is `None` and default butler
244 collections are not defined.
245 TypeError
246 Raised when the arguments are incompatible, such as when a
247 collection wildcard is passed when ``find_first`` is `True`.
249 Notes
250 -----
251 When multiple dataset types are queried in a single call, the
252 results of this operation are equivalent to querying for each dataset
253 type separately in turn, and no information about the relationships
254 between datasets of different types is included.
255 """
256 dataset_type_name, storage_class_name, query = self._join_dataset_search_impl(
257 dataset_type, collections
258 )
259 dataset_search = query._tree.datasets[dataset_type_name]
260 spec = DatasetRefResultSpec.model_construct(
261 dataset_type_name=dataset_type_name,
262 dimensions=dataset_search.dimensions,
263 storage_class_name=storage_class_name,
264 include_dimension_records=False,
265 find_first=find_first,
266 )
267 return DatasetRefQueryResults(self._driver, tree=query._tree, spec=spec)
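# Example (illustrative sketch, not from the original source): a find-first
# dataset search; the "calexp" dataset type, collection name, instrument, and
# visit value are hypothetical, and `butler` is assumed to exist.
with butler._query() as query:
    refs = query.where(instrument="LSSTCam", visit=1234).datasets(
        "calexp", collections=["LSSTCam/runs/nightly"], find_first=True
    )
    for ref in refs:
        print(ref.dataId, ref.run)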
269 def dimension_records(self, element: str) -> DimensionRecordQueryResults:
270 """Return a result object that is a `DimensionRecord` iterable.
272 Parameters
273 ----------
274 element : `str`
275 The name of a dimension element to obtain records for.
277 Returns
278 -------
279 records : `.queries.DimensionRecordQueryResults`
280 Dimension records matching the given query parameters.
281 """
282 tree = self._tree
283 if element not in tree.dimensions.elements:
284 tree = tree.join_dimensions(self._driver.universe[element].minimal_group)
285 result_spec = DimensionRecordResultSpec(element=self._driver.universe[element])
286 return DimensionRecordQueryResults(self._driver, tree, result_spec)
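# Example (illustrative sketch, not from the original source): iterating over
# records for a single dimension element; `butler` is assumed to exist and the
# `full_name` field is specific to the default universe's "detector" element.
with butler._query() as query:
    for record in query.where(instrument="LSSTCam").dimension_records("detector"):
        print(record.id, record.full_name)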
288 def materialize(
289 self,
290 *,
291 dimensions: Iterable[str] | DimensionGroup | None = None,
292 datasets: Iterable[str] | None = None,
293 ) -> Query:
294 """Execute the query, save its results to a temporary location, and
295 return a new query that represents fetching or joining against those
296 saved results.
298 Parameters
299 ----------
300 dimensions : `~collections.abc.Iterable` [ `str` ] or \
301 `DimensionGroup`, optional
302 Dimensions to include in the temporary results. Default is to
303 include all dimensions in the query.
304 datasets : `~collections.abc.Iterable` [ `str` ], optional
305 Names of dataset types that should be included in the new query;
306 default is to include `constraint_dataset_types`.
308 Returns
309 -------
310 query : `Query`
311 A new query object that represents the materialized rows.
313 Notes
314 -----
315 Only dimension key columns and (at the discretion of the
316 implementation) certain dataset columns are actually materialized,
317 since at this stage we do not know which dataset or dimension record
318 fields are actually needed in result rows, and these can be joined back
319 in on the materialized dimension keys. But all constraints on those
320 dimension keys (including dataset existence) are applied to the
321 materialized rows.
322 """
323 if datasets is None:
324 datasets = frozenset(self.constraint_dataset_types)
325 else:
326 datasets = frozenset(datasets)
327 if not (datasets <= self.constraint_dataset_types):
328 raise InvalidQueryError(
329 f"Dataset(s) {datasets - self.constraint_dataset_types} are present in the query."
330 )
331 if dimensions is None:
332 dimensions = self._tree.dimensions
333 else:
334 dimensions = self._driver.universe.conform(dimensions)
335 key = self._driver.materialize(self._tree, dimensions, datasets)
336 tree = make_identity_query_tree(self._driver.universe).join_materialization(
337 key, dimensions=dimensions
338 )
339 for dataset_type_name in datasets:
340 dataset_search = self._tree.datasets[dataset_type_name]
341 if not (dataset_search.dimensions <= tree.dimensions):
342 raise InvalidQueryError(
343 f"Materialization-backed query has dimensions {tree.dimensions}, which do not "
344 f"cover the dimensions {dataset_search.dimensions} of dataset {dataset_type_name!r}. "
345 "Expand the dimensions or drop this dataset type in the arguments to materialize to "
346 "avoid this error."
347 )
348 tree = tree.join_dataset(dataset_type_name, self._tree.datasets[dataset_type_name])
349 return Query(self._driver, tree)
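# Example (illustrative sketch, not from the original source): materializing an
# expensive constraint once and deriving two result sets from it; the dataset
# type, collection, and expression values are hypothetical.
with butler._query() as query:
    query = query.join_dataset_search("raw", collections=["LSSTCam/raw/all"])
    query = query.where("instrument = 'LSSTCam' AND exposure.day_obs = 20240101")
    frozen = query.materialize()  # all dimensions and joined datasets by default
    exposures = list(frozen.data_ids(["exposure", "detector"]))
    refs = list(frozen.datasets("raw"))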
351 def join_dataset_search(
352 self,
353 dataset_type: str | DatasetType,
354 collections: Iterable[str] | None = None,
355 ) -> Query:
356 """Return a new query with a search for a dataset joined in.
358 Parameters
359 ----------
360 dataset_type : `str` or `DatasetType`
361 Dataset type or name. May not refer to a dataset component.
362 collections : `~collections.abc.Iterable` [ `str` ], optional
363 Iterable of collections to search. Order is preserved, but will
364 not matter if the dataset search is only used as a constraint on
365 dimensions or if ``find_first=False`` when requesting results. If
366 not present or `None`, the default collection search path will be
367 used.
369 Returns
370 -------
371 query : `Query`
372 A new query object with dataset columns available and rows
373 restricted to those consistent with the found data IDs.
375 Raises
376 ------
377 DatasetTypeError
378 Raised if the given dataset type is inconsistent with the
379 registered dataset type.
380 MissingDatasetTypeError
381 Raised if the dataset type has not been registered and only a
382 `str` dataset type name was given.
384 Notes
385 -----
386 This method may require communication with the server unless the
387 dataset type and collections have already been referenced by the same
388 query context.
389 """
390 _, _, query = self._join_dataset_search_impl(
391 dataset_type, collections, allow_storage_class_overrides=False
392 )
393 return query
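# Example (illustrative sketch, not from the original source): using a dataset
# search purely as an existence constraint on data IDs; the "raw" dataset type
# and collection are hypothetical and `butler` is assumed to exist.
with butler._query() as query:
    query = query.join_dataset_search("raw", collections=["LSSTCam/raw/all"])
    # Only data IDs for which a "raw" dataset exists in the collection survive.
    for data_id in query.data_ids(["exposure", "detector"]):
        print(data_id)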
395 def join_data_coordinates(self, iterable: Iterable[DataCoordinate]) -> Query:
396 """Return a new query that joins in an explicit table of data IDs.
398 Parameters
399 ----------
400 iterable : `~collections.abc.Iterable` [ `DataCoordinate` ]
401 Iterable of `DataCoordinate`. All items must have the same
402 dimensions. Must have at least one item.
404 Returns
405 -------
406 query : `Query`
407 A new query object with the data IDs joined in.
408 """
409 rows: set[tuple[DataIdValue, ...]] = set()
410 dimensions: DimensionGroup | None = None
411 for data_coordinate in iterable:
412 if dimensions is None:
413 dimensions = data_coordinate.dimensions
414 elif dimensions != data_coordinate.dimensions:
415 raise InvalidQueryError(
416 f"Inconsistent dimensions: {dimensions} != {data_coordinate.dimensions}."
417 )
418 rows.add(data_coordinate.required_values)
419 if dimensions is None:
420 raise InvalidQueryError("Cannot upload an empty data coordinate set.")
421 key = self._driver.upload_data_coordinates(dimensions, rows)
422 return Query(
423 tree=self._tree.join_data_coordinate_upload(dimensions=dimensions, key=key), driver=self._driver
424 )
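# Example (illustrative sketch, not from the original source): uploading a
# small, explicit set of data IDs; `butler` is assumed to exist, and the use of
# `DataCoordinate.standardize` with `universe=` plus the exposure values are
# assumptions.
from lsst.daf.butler import DataCoordinate

uploaded = [
    DataCoordinate.standardize(
        {"instrument": "LSSTCam", "exposure": exposure_id},
        universe=butler.dimensions,
    )
    for exposure_id in (2024010100001, 2024010100002)
]
with butler._query() as query:
    query = query.join_data_coordinates(uploaded)
    refs = list(query.datasets("raw", collections=["LSSTCam/raw/all"]))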
426 def join_dimensions(self, dimensions: Iterable[str] | DimensionGroup) -> Query:
427 """Return a new query that joins the logical tables for additional
428 dimensions.
430 Parameters
431 ----------
432 dimensions : `~collections.abc.Iterable` [ `str` ] or `DimensionGroup`
433 Names of dimensions to join in.
435 Returns
436 -------
437 query : `Query`
438 A new query object with the dimensions joined in.
440 Notes
441 -----
442 Dimensions are automatically joined in whenever needed, so this method
443 should rarely need to be called directly.
444 """
445 dimensions = self._driver.universe.conform(dimensions)
446 return Query(tree=self._tree.join_dimensions(dimensions), driver=self._driver)
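# Example (illustrative sketch, not from the original source): explicitly
# joining dimensions, which is rarely needed because `where` and the result
# methods join dimensions on demand; `butler` is assumed to exist.
with butler._query() as query:
    query = query.join_dimensions(["exposure", "detector"])
    print(query.constraint_dimensions)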
448 def where(
449 self,
450 *args: str | Predicate | DataId,
451 bind: Mapping[str, Any] | None = None,
452 **kwargs: Any,
453 ) -> Query:
454 """Return a query with a boolean-expression filter on its rows.
456 Parameters
457 ----------
458 *args
459 Constraints to apply, combined with logical AND. Arguments may be
460 `str` expressions to parse, `Predicate` objects (these are
461 typically constructed via `expression_factory`) or data IDs.
462 bind : `~collections.abc.Mapping`
463 Mapping from string identifier appearing in a string expression to
464 a literal value that should be substituted for it. This is
465 recommended instead of embedding literals directly into the
466 expression, especially for strings, timespans, or other types where
467 quoting or formatting is nontrivial.
468 **kwargs
469 Data ID key value pairs that extend and override any present in
470 ``*args``.
472 Returns
473 -------
474 query : `Query`
475 A new query object with the given row filters (as well as any
476 already present in ``self``). All row filters are combined with
477 logical AND.
479 Notes
480 -----
481 If an expression references a dimension or dimension element that is
482 not already present in the query, it will be joined in, but dataset
483 searches must already be joined into a query in order to reference
484 their fields in expressions.
486 Data ID values are not checked for consistency; they are extracted from
487 ``args`` and then ``kwargs`` and combined, with later values overriding
488 earlier ones.
489 """
490 return Query(
491 tree=self._tree.where(
492 convert_where_args(self.dimensions, self.constraint_dataset_types, *args, bind=bind, **kwargs)
493 ),
494 driver=self._driver,
495 )
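# Example (illustrative sketch, not from the original source): mixing the three
# kinds of `where` arguments; the expression string, bind value, band, and
# instrument are hypothetical.
with butler._query() as query:
    x = query.expression_factory
    query = query.where(
        "visit.day_obs > min_day",   # string expression to be parsed
        x.band == "r",               # Predicate built via the expression factory
        instrument="LSSTCam",        # data ID key-value pair
        bind={"min_day": 20240101},  # literal bound to `min_day` in the string
    )
    for data_id in query.data_ids(["visit"]):
        print(data_id)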
497 def _join_dataset_search_impl(
498 self,
499 dataset_type: str | DatasetType,
500 collections: Iterable[str] | None = None,
501 allow_storage_class_overrides: bool = True,
502 ) -> tuple[str, str, Query]:
503 """Implement `join_dataset_search`, and also return the dataset type
504 name and storage class, in addition to the modified Query.
505 """
506 # In this method we need the dimensions of the dataset type, but we
507 # might not need the storage class, since the dataset may only be used
508 # as an existence constraint. It depends on whether
509 # `join_dataset_search` or `datasets` is calling this method.
510 dimensions: DimensionGroup | None = None
511 storage_class_name: str | None = None
512 # Handle DatasetType vs. str arg.
513 if isinstance(dataset_type, DatasetType):
514 dataset_type_name = dataset_type.name
515 dimensions = dataset_type.dimensions.as_group()
516 storage_class_name = dataset_type.storageClass_name
517 elif isinstance(dataset_type, str):
518 dataset_type_name = dataset_type
519 else:
520 raise TypeError(f"Invalid dataset type argument {dataset_type!r}.")
521 # See if this dataset has already been joined into the query.
522 if existing_search := self._tree.datasets.get(dataset_type_name):
523 if collections is None:
524 collections = existing_search.collections
525 else:
526 collections = tuple(ensure_iterable(collections))
527 if collections != existing_search.collections:
528 raise InvalidQueryError(
529 f"Dataset type {dataset_type_name!r} was already joined into this "
530 "query with a different collection search path (previously "
531 f"[{', '.join(existing_search.collections)}], now [{', '.join(collections)}])."
532 )
533 if dimensions is None:
534 dimensions = existing_search.dimensions
535 else:
536 if collections is None:
537 collections = self._driver.get_default_collections()
538 collections = tuple(ensure_iterable(collections))
539 # Look up the data repository definition of the dataset type to check
540 # for consistency, or get dimensions and storage class if we don't have
541 # them.
542 resolved_dataset_type = self._driver.get_dataset_type(dataset_type_name)
543 resolved_dimensions = resolved_dataset_type.dimensions.as_group()
544 if dimensions is not None and dimensions != resolved_dimensions:
545 raise DatasetTypeError(
546 f"Given dimensions {dimensions} for dataset type {dataset_type_name!r} do not match the "
547 f"registered dimensions {resolved_dimensions}."
548 )
549 if storage_class_name is not None:
550 if storage_class_name != resolved_dataset_type.storageClass_name:
551 if not allow_storage_class_overrides:
552 raise InvalidQueryError(
553 f"Storage class {storage_class_name!r} for dataset type {dataset_type!r} differs "
554 f"from repository definition {resolved_dataset_type.storageClass_name!r}, but "
555 "join_dataset_search does not are about storage classes and cannot record this "
556 "override. Pass the override to `Query.datasets` instead."
557 )
558 if not (
559 StorageClassFactory()
560 .getStorageClass(storage_class_name)
561 .can_convert(resolved_dataset_type.storageClass)
562 ):
563 raise DatasetTypeError(
564 f"Given storage class {storage_class_name!r} for {dataset_type_name!r} is not "
565 f"compatible with repository storage class {resolved_dataset_type.storageClass_name}."
566 )
567 else:
568 storage_class_name = resolved_dataset_type.storageClass_name
569 dataset_search = DatasetSearch.model_construct(
570 collections=collections,
571 dimensions=resolved_dimensions,
572 )
573 return (
574 dataset_type_name,
575 storage_class_name,
576 Query(self._driver, self._tree.join_dataset(dataset_type_name, dataset_search)),
577 )