Coverage for python/lsst/daf/butler/queries/_query.py: 24%
126 statements
coverage.py v7.5.0, created at 2024-04-26 02:47 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = ("Query",)
32from collections.abc import Iterable, Mapping, Set
33from typing import Any, final
35from lsst.utils.iteration import ensure_iterable
37from .._dataset_type import DatasetType
38from .._exceptions import InvalidQueryError
39from .._storage_class import StorageClassFactory
40from ..dimensions import DataCoordinate, DataId, DataIdValue, DimensionGroup
41from ..registry import DatasetTypeError
42from ._base import QueryBase
43from ._data_coordinate_query_results import DataCoordinateQueryResults
44from ._dataset_query_results import DatasetRefQueryResults
45from ._dimension_record_query_results import DimensionRecordQueryResults
46from .convert_args import convert_where_args
47from .driver import QueryDriver
48from .expression_factory import ExpressionFactory
49from .result_specs import DataCoordinateResultSpec, DatasetRefResultSpec, DimensionRecordResultSpec
50from .tree import DatasetSearch, Predicate, QueryTree, make_identity_query_tree
53@final
54class Query(QueryBase):
55 """A method-chaining builder for butler queries.
57 Parameters
58 ----------
59 driver : `QueryDriver`
60 Implementation object that knows how to actually execute queries.
61 tree : `QueryTree`, optional
62 Description of the query as a tree of joins and column expressions.
63 Defaults to the result of a call to `tree.make_identity_query_tree`.
65 Notes
66 -----
67 `Query` objects should never be constructed directly by users; use
68 `Butler._query` instead.
70 A `Query` object represents the first stage of query construction, in which
71 constraints and joins are defined (roughly corresponding to the WHERE and
72 FROM clauses in SQL). The various "results" objects represent the second
73 (and final) stage, where the columns returned are specified and any sorting
74 or integer slicing can be applied. Result objects are obtained from the
75 `data_ids`, `datasets`, and `dimension_records` methods.
77 `Query` and query-result objects are always immutable (except for caching
78 information fetched from the database or server), so modifier methods
79 always return a new object without modifying the current one.
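A minimal sketch of this two-stage pattern (the dataset type and collection
names here are purely illustrative)::

    with butler._query() as query:
        query = query.where(instrument="LSSTCam")  # first stage: constraints
        refs = query.datasets("raw", collections=["LSSTCam/raw/all"])  # second stage: results
        for ref in refs:
            ...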
80 """
82 def __init__(self, driver: QueryDriver, tree: QueryTree | None = None):
83 # __init__ defined here because there are multiple base classes and
84 # not all define __init__ (and hence inherit object.__init__, which
85 # just ignores its args). Even if we just delegate to super(), it
86 # seems less fragile to make it explicit here.
87 if tree is None:
88 tree = make_identity_query_tree(driver.universe)
89 super().__init__(driver, tree)
91 @property
92 def constraint_dataset_types(self) -> Set[str]:
93 """The names of all dataset types joined into the query.
95 The existence of datasets of these types constrains the data IDs of any
96 type of result. Fields for these dataset types are also usable in
97 'where' expressions.
98 """
99 # Note that this includes only dataset type names, not `DatasetType`
100 # instances; the `DatasetQueryResults` adapter returned by the
101 # `datasets` method does include `DatasetType` instances, since it is
102 # in a better position to track and respect any storage class override
103 # specified.
104 return self._tree.datasets.keys()
106 @property
107 def constraint_dimensions(self) -> DimensionGroup:
108 """Dimensions currently present in the query, either directly or
109 indirectly.
111 This includes dimensions that are present in any joined subquery (such
112 as a dataset search, materialization, or data ID upload) or `where`
113 argument, as well as any required or implied dependency of those
114 dimensions.
115 """
116 return self._tree.dimensions
118 @property
119 def expression_factory(self) -> ExpressionFactory:
120 """A factory for column expressions using overloaded operators.
122 Notes
123 -----
124 Typically this attribute will be assigned to a single-character local
125 variable, and then its (dynamic) attributes can be used to obtain
126 references to columns that can be included in a query::
128 with butler._query() as query:
129 x = query.expression_factory
130 query = query.where(
131 x.instrument == "LSSTCam",
132 x.visit.day_obs > 20240701,
133 x.any(x.band == 'u', x.band == 'y'),
134 )
136 As shown above, the returned object also has an `any` method to combine
137 expressions with logical OR (as well as `not_` and `all`,
138 though the latter is rarely necessary since `where` already combines
139 its arguments with AND).
141 Proxies for fields associated with dataset types (``dataset_id``,
142 ``ingest_date``, ``run``, ``collection``, as well as ``timespan`` for
143 `~CollectionType.CALIBRATION` collection searches) can be obtained with
144 dict-like access instead::
146 with butler._query() as query:
x = query.expression_factory
147 query = query.order_by(x["raw"].ingest_date)
149 Expression proxy objects that correspond to scalar columns overload the
150 standard comparison operators (``==``, ``!=``, ``<``, ``>``, ``<=``,
151 ``>=``) and provide `~ScalarExpressionProxy.in_range`,
152 `~ScalarExpressionProxy.in_iterable`, and
153 `~ScalarExpressionProxy.in_query` methods for membership tests. For
154 `order_by` contexts, they also have a `~ScalarExpressionProxy.desc`
155 property to indicate that the sort order for that expression should be
156 reversed.
158 Proxy objects for region and timespan fields have an `overlaps` method,
159 and timespans also have `~TimespanProxy.begin` and `~TimespanProxy.end`
160 properties to access scalar expression proxies for the bounds.
162 All proxy objects also have a `~ExpressionProxy.is_null` property.
164 Literal values can be created by calling `ExpressionFactory.literal`,
165 but can almost always be created implicitly via overloaded operators
166 instead.
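As a further sketch (assuming the standard instrument/visit/exposure
dimension universe; the field names and values are illustrative), range and
overlap constraints might be written as::

    with butler._query() as query:
        x = query.expression_factory
        query = query.where(
            x.exposure.id.in_range(100, 200),
            x.visit.timespan.overlaps(x.exposure.timespan),
        )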
167 """
168 return ExpressionFactory(self._driver.universe)
170 def data_ids(
171 self, dimensions: DimensionGroup | Iterable[str] | str | None = None
172 ) -> DataCoordinateQueryResults:
173 """Return a result object that is a `DataCoordinate` iterable.
175 Parameters
176 ----------
177 dimensions : `DimensionGroup`, `str`, or \
178 `~collections.abc.Iterable` [`str`], optional
179 The dimensions of the data IDs to yield, as either `DimensionGroup`
180 instances or `str` names. Will be automatically expanded to a
181 complete `DimensionGroup`. These dimensions do not need to match
182 the query's current `dimensions`. Default is
183 `constraint_dimensions`.
185 Returns
186 -------
187 data_ids : `DataCoordinateQueryResults`
188 Data IDs matching the given query parameters. These are guaranteed
189 to identify all dimensions (`DataCoordinate.hasFull` returns
190 `True`), but will not contain `DimensionRecord` objects
191 (`DataCoordinate.hasRecords` returns `False`). Call
192 `~DataCoordinateQueryResults.with_dimension_records` on the
193 returned object to include dimension records as well.
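Notes
-----
A minimal usage sketch (the ``instrument`` and ``visit`` values are
illustrative)::

    with butler._query() as query:
        query = query.where(instrument="LSSTCam", visit=12345)
        for data_id in query.data_ids(["detector"]):
            print(data_id["detector"])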
194 """
195 tree = self._tree
196 if dimensions is None:
197 dimensions = self._tree.dimensions
198 else:
199 dimensions = self._driver.universe.conform(dimensions)
200 if not dimensions <= self._tree.dimensions:
201 tree = tree.join_dimensions(dimensions)
202 result_spec = DataCoordinateResultSpec(dimensions=dimensions, include_dimension_records=False)
203 return DataCoordinateQueryResults(self._driver, tree, result_spec)
205 def datasets(
206 self,
207 dataset_type: str | DatasetType,
208 collections: str | Iterable[str] | None = None,
209 *,
210 find_first: bool = True,
211 ) -> DatasetRefQueryResults:
212 """Return a result object that is a `DatasetRef` iterable.
214 Parameters
215 ----------
216 dataset_type : `str` or `DatasetType`
217 The dataset type to search for.
218 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
219 The collection or collections to search, in order. If not provided
220 or `None`, and the dataset has not already been joined into the
221 query, the default collection search path for this butler is used.
222 find_first : `bool`, optional
223 If `True` (default), for each result data ID, only yield one
224 `DatasetRef` of each `DatasetType`, from the first collection in
225 which a dataset of that dataset type appears (according to the
226 order of ``collections`` passed in). If `True`, ``collections``
227 must not be ``...``.
229 Returns
230 -------
231 refs : `.queries.DatasetRefQueryResults`
232 Dataset references matching the given query criteria. Nested data
233 IDs are guaranteed to include values for all implied dimensions
234 (i.e. `DataCoordinate.hasFull` will return `True`), but will not
235 include dimension records (`DataCoordinate.hasRecords` will be
236 `False`) unless
237 `~.queries.DatasetRefQueryResults.with_dimension_records` is
238 called on the result object (which returns a new one).
240 Raises
241 ------
242 lsst.daf.butler.registry.DatasetTypeExpressionError
243 Raised when the ``dataset_type`` expression is invalid.
244 lsst.daf.butler.registry.NoDefaultCollectionError
245 Raised when ``collections`` is `None` and default butler
246 collections are not defined.
247 TypeError
248 Raised when the arguments are incompatible, such as when a
249 collection wildcard is passed when ``find_first`` is `True`.
251 Notes
252 -----
253 When multiple dataset types are queried in a single call, the
254 results of this operation are equivalent to querying for each dataset
255 type separately in turn, and no information about the relationships
256 between datasets of different types is included.
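A minimal usage sketch (the dataset type and collection names are
illustrative)::

    with butler._query() as query:
        refs = query.datasets("calexp", collections=["HSC/runs/RC2"])
        for ref in refs.with_dimension_records():
            print(ref.dataId, ref.run)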
257 """
258 dataset_type_name, storage_class_name, query = self._join_dataset_search_impl(
259 dataset_type, collections
260 )
261 dataset_search = query._tree.datasets[dataset_type_name]
262 spec = DatasetRefResultSpec.model_construct(
263 dataset_type_name=dataset_type_name,
264 dimensions=dataset_search.dimensions,
265 storage_class_name=storage_class_name,
266 include_dimension_records=False,
267 find_first=find_first,
268 )
269 return DatasetRefQueryResults(self._driver, tree=query._tree, spec=spec)
271 def dimension_records(self, element: str) -> DimensionRecordQueryResults:
272 """Return a result object that is a `DimensionRecord` iterable.
274 Parameters
275 ----------
276 element : `str`
277 The name of a dimension element to obtain records for.
279 Returns
280 -------
281 records : `.queries.DimensionRecordQueryResults`
282 Dimension records matching the given query parameters.
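Notes
-----
A minimal usage sketch (assuming the standard ``detector`` dimension; the
``instrument`` value is illustrative)::

    with butler._query() as query:
        records = query.where(instrument="LSSTCam").dimension_records("detector")
        for record in records:
            print(record.dataId)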
283 """
284 tree = self._tree
285 if element not in tree.dimensions.elements:
286 tree = tree.join_dimensions(self._driver.universe[element].minimal_group)
287 result_spec = DimensionRecordResultSpec(element=self._driver.universe[element])
288 return DimensionRecordQueryResults(self._driver, tree, result_spec)
290 def materialize(
291 self,
292 *,
293 dimensions: Iterable[str] | DimensionGroup | None = None,
294 datasets: Iterable[str] | None = None,
295 ) -> Query:
296 """Execute the query, save its results to a temporary location, and
297 return a new query that represents fetching or joining against those
298 saved results.
300 Parameters
301 ----------
302 dimensions : `~collections.abc.Iterable` [ `str` ] or \
303 `DimensionGroup`, optional
304 Dimensions to include in the temporary results. Default is to
305 include all dimensions in the query.
306 datasets : `~collections.abc.Iterable` [ `str` ], optional
307 Names of dataset types that should be included in the new query;
308 default is to include `constraint_dataset_types`.
310 Returns
311 -------
312 query : `Query`
313 A new query object that represents the materialized rows.
315 Notes
316 -----
317 Only dimension key columns and (at the discretion of the
318 implementation) certain dataset columns are actually materialized,
319 since at this stage we do not know which dataset or dimension record
320 fields are actually needed in result rows, and these can be joined back
321 in on the materialized dimension keys. But all constraints on those
322 dimension keys (including dataset existence) are applied to the
323 materialized rows.
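A minimal usage sketch (the dataset type and collection names are
illustrative)::

    with butler._query() as query:
        query = query.join_dataset_search("raw", collections=["LSSTCam/raw/all"])
        query = query.materialize()
        for data_id in query.data_ids(["exposure", "detector"]):
            ...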
324 """
325 if datasets is None:
326 datasets = frozenset(self.constraint_dataset_types)
327 else:
328 datasets = frozenset(datasets)
329 if not (datasets <= self.constraint_dataset_types):
330 raise InvalidQueryError(
331 f"Dataset(s) {datasets - self.constraint_dataset_types} are present in the query."
332 )
333 if dimensions is None:
334 dimensions = self._tree.dimensions
335 else:
336 dimensions = self._driver.universe.conform(dimensions)
337 key = self._driver.materialize(self._tree, dimensions, datasets)
338 tree = make_identity_query_tree(self._driver.universe).join_materialization(
339 key, dimensions=dimensions
340 )
341 for dataset_type_name in datasets:
342 dataset_search = self._tree.datasets[dataset_type_name]
343 if not (dataset_search.dimensions <= tree.dimensions):
344 raise InvalidQueryError(
345 f"Materialization-backed query has dimensions {tree.dimensions}, which do not "
346 f"cover the dimensions {dataset_search.dimensions} of dataset {dataset_type_name!r}. "
347 "Expand the dimensions or drop this dataset type in the arguments to materialize to "
348 "avoid this error."
349 )
350 tree = tree.join_dataset(dataset_type_name, self._tree.datasets[dataset_type_name])
351 return Query(self._driver, tree)
353 def join_dataset_search(
354 self,
355 dataset_type: str | DatasetType,
356 collections: Iterable[str] | None = None,
357 ) -> Query:
358 """Return a new query with a search for a dataset joined in.
360 Parameters
361 ----------
362 dataset_type : `str` or `DatasetType`
363 Dataset type or name. May not refer to a dataset component.
364 collections : `~collections.abc.Iterable` [ `str` ], optional
365 Iterable of collections to search. Order is preserved, but will
366 not matter if the dataset search is only used as a constraint on
367 dimensions or if ``find_first=False`` when requesting results. If
368 not present or `None`, the default collection search path will be
369 used.
371 Returns
372 -------
373 query : `Query`
374 A new query object with dataset columns available and rows
375 restricted to those consistent with the found data IDs.
377 Raises
378 ------
379 DatasetTypeError
380 Raised if the given dataset type is inconsistent with the registered
381 dataset type.
382 MissingDatasetTypeError
383 Raised if the dataset type has not been registered and only a
384 `str` dataset type name was given.
386 Notes
387 -----
388 This method may require communication with the server unless the
389 dataset type and collections have already been referenced by the same
390 query context.
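A minimal usage sketch, using the joined dataset purely as an existence
constraint on the data IDs (the names are illustrative)::

    with butler._query() as query:
        query = query.join_dataset_search("raw", collections=["LSSTCam/raw/all"])
        data_ids = query.data_ids(["exposure", "detector"])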
391 """
392 _, _, query = self._join_dataset_search_impl(
393 dataset_type, collections, allow_storage_class_overrides=False
394 )
395 return query
397 def join_data_coordinates(self, iterable: Iterable[DataCoordinate]) -> Query:
398 """Return a new query that joins in an explicit table of data IDs.
400 Parameters
401 ----------
402 iterable : `~collections.abc.Iterable` [ `DataCoordinate` ]
403 Iterable of `DataCoordinate`. All items must have the same
404 dimensions. Must have at least one item.
406 Returns
407 -------
408 query : `Query`
409 A new query object with the data IDs joined in.
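Notes
-----
A minimal usage sketch (assuming the standard ``instrument`` and
``detector`` dimensions; the values are illustrative)::

    data_ids = [
        DataCoordinate.standardize(
            instrument="LSSTCam", detector=detector, universe=butler.dimensions
        )
        for detector in (10, 11, 12)
    ]
    with butler._query() as query:
        query = query.join_data_coordinates(data_ids)
        records = query.dimension_records("detector")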
410 """
411 rows: set[tuple[DataIdValue, ...]] = set()
412 dimensions: DimensionGroup | None = None
413 for data_coordinate in iterable:
414 if dimensions is None:
415 dimensions = data_coordinate.dimensions
416 elif dimensions != data_coordinate.dimensions:
417 raise InvalidQueryError(
418 f"Inconsistent dimensions: {dimensions} != {data_coordinate.dimensions}."
419 )
420 rows.add(data_coordinate.required_values)
421 if dimensions is None:
422 raise InvalidQueryError("Cannot upload an empty data coordinate set.")
423 key = self._driver.upload_data_coordinates(dimensions, rows)
424 return Query(
425 tree=self._tree.join_data_coordinate_upload(dimensions=dimensions, key=key), driver=self._driver
426 )
428 def join_dimensions(self, dimensions: Iterable[str] | DimensionGroup) -> Query:
429 """Return a new query that joins the logical tables for additional
430 dimensions.
432 Parameters
433 ----------
434 dimensions : `~collections.abc.Iterable` [ `str` ] or `DimensionGroup`
435 Names of dimensions to join in.
437 Returns
438 -------
439 query : `Query`
440 A new query object with the dimensions joined in.
442 Notes
443 -----
444 Dimensions are automatically joined in whenever needed, so this method
445 should rarely need to be called directly.
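A minimal usage sketch::

    with butler._query() as query:
        query = query.join_dimensions(["visit", "detector"])
        data_ids = query.data_ids()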
446 """
447 dimensions = self._driver.universe.conform(dimensions)
448 return Query(tree=self._tree.join_dimensions(dimensions), driver=self._driver)
450 def where(
451 self,
452 *args: str | Predicate | DataId,
453 bind: Mapping[str, Any] | None = None,
454 **kwargs: Any,
455 ) -> Query:
456 """Return a query with a boolean-expression filter on its rows.
458 Parameters
459 ----------
460 *args
461 Constraints to apply, combined with logical AND. Arguments may be
462 `str` expressions to parse, `Predicate` objects (these are
463 typically constructed via `expression_factory`) or data IDs.
464 bind : `~collections.abc.Mapping`
465 Mapping from string identifiers appearing in a string expression to
466 the literal values that should be substituted for them. This is
467 recommended instead of embedding literals directly into the
468 expression, especially for strings, timespans, or other types where
469 quoting or formatting is nontrivial.
470 **kwargs
471 Data ID key value pairs that extend and override any present in
472 ``*args``.
474 Returns
475 -------
476 query : `Query`
477 A new query object with the given row filters (as well as any
478 already present in ``self``). All row filters are combined with
479 logical AND.
481 Notes
482 -----
483 If an expression references a dimension or dimension element that is
484 not already present in the query, it will be joined in, but dataset
485 searches must already be joined into a query in order to reference
486 their fields in expressions.
488 Data ID values are not checked for consistency; they are extracted from
489 ``args`` and then ``kwargs`` and combined, with later values overriding
490 earlier ones.
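A minimal usage sketch combining a parsed string expression, a ``bind``
value, and a data ID keyword argument (the names and values are
illustrative)::

    with butler._query() as query:
        query = query.where(
            "visit.day_obs > min_day_obs AND band IN ('g', 'r')",
            bind={"min_day_obs": 20240101},
            instrument="LSSTCam",
        )
        data_ids = query.data_ids(["visit"])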
491 """
492 return Query(
493 tree=self._tree.where(
494 convert_where_args(
495 self.constraint_dimensions,
496 self.constraint_dataset_types,
497 *args,
498 bind=bind,
499 **kwargs,
500 )
501 ),
502 driver=self._driver,
503 )
505 def _join_dataset_search_impl(
506 self,
507 dataset_type: str | DatasetType,
508 collections: Iterable[str] | None = None,
509 allow_storage_class_overrides: bool = True,
510 ) -> tuple[str, str, Query]:
511 """Implement `join_dataset_search`, and also return the dataset type
512 name and storage class, in addition to the modified Query.
513 """
514 # In this method we need the dimensions of the dataset type, but we
515 # might not need the storage class, since the dataset may only be used
516 # as an existence constraint. It depends on whether
517 # `join_dataset_search` or `datasets` is calling this method.
518 dimensions: DimensionGroup | None = None
519 storage_class_name: str | None = None
520 # Handle DatasetType vs. str arg.
521 if isinstance(dataset_type, DatasetType):
522 dataset_type_name = dataset_type.name
523 dimensions = dataset_type.dimensions.as_group()
524 storage_class_name = dataset_type.storageClass_name
525 elif isinstance(dataset_type, str):
526 dataset_type_name = dataset_type
527 else:
528 raise TypeError(f"Invalid dataset type argument {dataset_type!r}.")
529 # See if this dataset has already been joined into the query.
530 if existing_search := self._tree.datasets.get(dataset_type_name):
531 if collections is None:
532 collections = existing_search.collections
533 else:
534 collections = tuple(ensure_iterable(collections))
535 if collections != existing_search.collections:
536 raise InvalidQueryError(
537 f"Dataset type {dataset_type_name!r} was already joined into this "
538 "query with a different collection search path (previously "
539 f"[{', '.join(existing_search.collections)}], now [{', '.join(collections)}])."
540 )
541 if dimensions is None:
542 dimensions = existing_search.dimensions
543 else:
544 if collections is None:
545 collections = self._driver.get_default_collections()
546 collections = tuple(ensure_iterable(collections))
547 # Look up the data repository definition of the dataset type to check
548 # for consistency, or get dimensions and storage class if we don't have
549 # them.
550 resolved_dataset_type = self._driver.get_dataset_type(dataset_type_name)
551 resolved_dimensions = resolved_dataset_type.dimensions.as_group()
552 if dimensions is not None and dimensions != resolved_dimensions:
553 raise DatasetTypeError(
554 f"Given dimensions {dimensions} for dataset type {dataset_type_name!r} do not match the "
555 f"registered dimensions {resolved_dimensions}."
556 )
557 if storage_class_name is not None:
558 if storage_class_name != resolved_dataset_type.storageClass_name:
559 if not allow_storage_class_overrides:
560 raise InvalidQueryError(
561 f"Storage class {storage_class_name!r} for dataset type {dataset_type!r} differs "
562 f"from repository definition {resolved_dataset_type.storageClass_name!r}, but "
563 "join_dataset_search does not are about storage classes and cannot record this "
564 "override. Pass the override to `Query.datasets` instead."
565 )
566 if not (
567 StorageClassFactory()
568 .getStorageClass(storage_class_name)
569 .can_convert(resolved_dataset_type.storageClass)
570 ):
571 raise DatasetTypeError(
572 f"Given storage class {storage_class_name!r} for {dataset_type_name!r} is not "
573 f"compatible with repository storage class {resolved_dataset_type.storageClass_name}."
574 )
575 else:
576 storage_class_name = resolved_dataset_type.storageClass_name
577 dataset_search = DatasetSearch.model_construct(
578 collections=collections,
579 dimensions=resolved_dimensions,
580 )
581 return (
582 dataset_type_name,
583 storage_class_name,
584 Query(self._driver, self._tree.join_dataset(dataset_type_name, dataset_search)),
585 )