Coverage for python/lsst/daf/butler/queries/_query.py: 24%
125 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Query",)

from collections.abc import Iterable, Mapping, Set
from typing import Any, final

from lsst.utils.iteration import ensure_iterable

from .._dataset_type import DatasetType
from .._storage_class import StorageClassFactory
from ..dimensions import DataCoordinate, DataId, DataIdValue, DimensionGroup
from ..registry import DatasetTypeError
from ._base import QueryBase
from ._data_coordinate_query_results import DataCoordinateQueryResults
from ._dataset_query_results import DatasetRefQueryResults
from ._dimension_record_query_results import DimensionRecordQueryResults
from .convert_args import convert_where_args
from .driver import QueryDriver
from .expression_factory import ExpressionFactory
from .result_specs import DataCoordinateResultSpec, DatasetRefResultSpec, DimensionRecordResultSpec
from .tree import DatasetSearch, InvalidQueryError, Predicate, QueryTree, make_identity_query_tree


@final
class Query(QueryBase):
    """A method-chaining builder for butler queries.

    Parameters
    ----------
    driver : `QueryDriver`
        Implementation object that knows how to actually execute queries.
    tree : `QueryTree`, optional
        Description of the query as a tree of joins and column expressions.
        Defaults to the result of a call to `tree.make_identity_query_tree`.

    Notes
    -----
    `Query` objects should never be constructed directly by users; use
    `Butler._query` instead.

    A `Query` object represents the first stage of query construction, in which
    constraints and joins are defined (roughly corresponding to the WHERE and
    FROM clauses in SQL). The various "results" objects represent the second
    (and final) stage, where the columns returned are specified and any sorting
    or integer slicing can be applied. Result objects are obtained from the
    `data_ids`, `datasets`, and `dimension_records` methods.

    `Query` and query-result objects are always immutable (except for caching
    information fetched from the database or server), so modifier methods
    always return a new object without modifying the current one.
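
    A rough sketch of the two stages (the instrument and dimension names here
    are only illustrative; any repository-specific values are assumptions)::

        with butler._query() as query:
            # First stage: constrain and join.
            query = query.where(instrument="LSSTCam")
            # Second stage: pick the result columns and iterate.
            for data_id in query.data_ids(["visit", "detector"]):
                ...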
    """

    def __init__(self, driver: QueryDriver, tree: QueryTree | None = None):
        # __init__ defined here because there are multiple base classes and
        # not all define __init__ (and hence inherit object.__init__, which
        # just ignores its args). Even if we just delegate to super(), it
        # seems less fragile to make it explicit here.
        if tree is None:
            tree = make_identity_query_tree(driver.universe)
        super().__init__(driver, tree)

    @property
    def constraint_dataset_types(self) -> Set[str]:
        """The names of all dataset types joined into the query.

        The existence of datasets of these types constrains the data IDs of any
        type of result. Fields for these dataset types are also usable in
        'where' expressions.
        """
        # Note that this includes only dataset type names, not `DatasetType`
        # instances; the `DatasetQueryResults` adapter returned by the
        # `datasets` method does include `DatasetType` instances, since it is
        # in a better position to track and respect any storage class override
        # specified.
        return self._tree.datasets.keys()

    @property
    def constraint_dimensions(self) -> DimensionGroup:
        """Dimensions currently present in the query, either directly or
        indirectly.

        This includes dimensions that are present in any joined subquery (such
        as a dataset search, materialization, or data ID upload) or `where`
        argument, as well as any required or implied dependency of those
        dimensions.
        """
        return self._tree.dimensions

    @property
    def expression_factory(self) -> ExpressionFactory:
        """A factory for column expressions using overloaded operators.

        Notes
        -----
        Typically this attribute will be assigned to a single-character local
        variable, and then its (dynamic) attributes can be used to obtain
        references to columns that can be included in a query::

            with butler._query() as query:
                x = query.expression_factory
                query = query.where(
                    x.instrument == "LSSTCam",
                    x.visit.day_obs > 20240701,
                    x.any(x.band == 'u', x.band == 'y'),
                )

        As shown above, the returned object also has an `any` method to
        combine expressions with logical OR (as well as `not_` and `all`,
        though the latter is rarely necessary since `where` already combines
        its arguments with AND).

        Proxies for fields associated with dataset types (``dataset_id``,
        ``ingest_date``, ``run``, ``collection``, as well as ``timespan`` for
        `~CollectionType.CALIBRATION` collection searches) can be obtained with
        dict-like access instead::

            with butler._query() as query:
                x = query.expression_factory
                query = query.order_by(x["raw"].ingest_date)

        Expression proxy objects that correspond to scalar columns overload the
        standard comparison operators (``==``, ``!=``, ``<``, ``>``, ``<=``,
        ``>=``) and provide `~ScalarExpressionProxy.in_range`,
        `~ScalarExpressionProxy.in_iterable`, and
        `~ScalarExpressionProxy.in_query` methods for membership tests. For
        `order_by` contexts, they also have a `~ScalarExpressionProxy.desc`
        property to indicate that the sort order for that expression should be
        reversed.
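
        As a further illustrative sketch of a membership test (the ``band``
        values here are assumptions, not repository contents)::

            with butler._query() as query:
                x = query.expression_factory
                query = query.where(x.band.in_iterable(["g", "r", "i"]))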

        Proxy objects for region and timespan fields have an `overlaps` method,
        and timespans also have `~TimespanProxy.begin` and `~TimespanProxy.end`
        properties to access scalar expression proxies for the bounds.

        All proxy objects also have a `~ExpressionProxy.is_null` property.

        Literal values can be created by calling `ExpressionFactory.literal`,
        but can almost always be created implicitly via overloaded operators
        instead.
        """
        return ExpressionFactory(self._driver.universe)

    def data_ids(
        self, dimensions: DimensionGroup | Iterable[str] | str | None = None
    ) -> DataCoordinateQueryResults:
        """Return a result object that is a `DataCoordinate` iterable.

        Parameters
        ----------
        dimensions : `DimensionGroup`, `str`, or \
                `~collections.abc.Iterable` [`str`], optional
            The dimensions of the data IDs to yield, as either `DimensionGroup`
            instances or `str` names. Will be automatically expanded to a
            complete `DimensionGroup`. These dimensions do not need to match
            the query's current `dimensions`. Default is
            `constraint_dimensions`.

        Returns
        -------
        data_ids : `DataCoordinateQueryResults`
            Data IDs matching the given query parameters. These are guaranteed
            to identify all dimensions (`DataCoordinate.hasFull` returns
            `True`), but will not contain `DimensionRecord` objects
            (`DataCoordinate.hasRecords` returns `False`). Call
            `~DataCoordinateQueryResults.with_dimension_records` on the
            returned object to include dimension records as well.
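
        For example, a rough usage sketch (the instrument value and dimension
        names here are only illustrative)::

            with butler._query() as query:
                data_ids = query.where(instrument="LSSTCam").data_ids(["visit", "detector"])
                for data_id in data_ids:
                    print(data_id["visit"], data_id["detector"])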
        """
        tree = self._tree
        if dimensions is None:
            dimensions = self._tree.dimensions
        else:
            dimensions = self._driver.universe.conform(dimensions)
            if not dimensions <= self._tree.dimensions:
                tree = tree.join_dimensions(dimensions)
        result_spec = DataCoordinateResultSpec(dimensions=dimensions, include_dimension_records=False)
        return DataCoordinateQueryResults(self._driver, tree, result_spec)

    def datasets(
        self,
        dataset_type: str | DatasetType,
        collections: str | Iterable[str] | None = None,
        *,
        find_first: bool = True,
    ) -> DatasetRefQueryResults:
        """Return a result object that is a `DatasetRef` iterable.

        Parameters
        ----------
        dataset_type : `str` or `DatasetType`
            The dataset type to search for.
        collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
            The collection or collections to search, in order. If not provided
            or `None`, and the dataset has not already been joined into the
            query, the default collection search path for this butler is used.
        find_first : `bool`, optional
            If `True` (default), for each result data ID, only yield one
            `DatasetRef` of each `DatasetType`, from the first collection in
            which a dataset of that dataset type appears (according to the
            order of ``collections`` passed in). If `True`, ``collections``
            must not be ``...``.

        Returns
        -------
        refs : `.queries.DatasetRefQueryResults`
            Dataset references matching the given query criteria. Nested data
            IDs are guaranteed to include values for all implied dimensions
            (i.e. `DataCoordinate.hasFull` will return `True`), but will not
            include dimension records (`DataCoordinate.hasRecords` will be
            `False`) unless
            `~.queries.DatasetRefQueryResults.with_dimension_records` is
            called on the result object (which returns a new one).

        Raises
        ------
        lsst.daf.butler.registry.DatasetTypeExpressionError
            Raised when the ``dataset_type`` expression is invalid.
        lsst.daf.butler.registry.NoDefaultCollectionError
            Raised when ``collections`` is `None` and default butler
            collections are not defined.
        TypeError
            Raised when the arguments are incompatible, such as when a
            collection wildcard is passed when ``find_first`` is `True`.

        Notes
        -----
        When multiple dataset types are queried in a single call, the
        results of this operation are equivalent to querying for each dataset
        type separately in turn, and no information about the relationships
        between datasets of different types is included.
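
        For example, a rough usage sketch of a find-first search (the dataset
        type and collection names here are assumptions about a particular
        repository)::

            with butler._query() as query:
                refs = list(query.datasets("raw", collections=["LSSTCam/raw/all"]))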
        """
        dataset_type_name, storage_class_name, query = self._join_dataset_search_impl(
            dataset_type, collections
        )
        dataset_search = query._tree.datasets[dataset_type_name]
        spec = DatasetRefResultSpec.model_construct(
            dataset_type_name=dataset_type_name,
            dimensions=dataset_search.dimensions,
            storage_class_name=storage_class_name,
            include_dimension_records=False,
            find_first=find_first,
        )
        return DatasetRefQueryResults(self._driver, tree=query._tree, spec=spec)

    def dimension_records(self, element: str) -> DimensionRecordQueryResults:
        """Return a result object that is a `DimensionRecord` iterable.

        Parameters
        ----------
        element : `str`
            The name of a dimension element to obtain records for.

        Returns
        -------
        records : `.queries.DimensionRecordQueryResults`
            Dimension records matching the given query parameters.
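
        For example, a rough usage sketch (the element and instrument names
        here are only illustrative)::

            with butler._query() as query:
                for record in query.where(instrument="LSSTCam").dimension_records("detector"):
                    ...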
        """
        tree = self._tree
        if element not in tree.dimensions.elements:
            tree = tree.join_dimensions(self._driver.universe[element].minimal_group)
        result_spec = DimensionRecordResultSpec(element=self._driver.universe[element])
        return DimensionRecordQueryResults(self._driver, tree, result_spec)

    def materialize(
        self,
        *,
        dimensions: Iterable[str] | DimensionGroup | None = None,
        datasets: Iterable[str] | None = None,
    ) -> Query:
        """Execute the query, save its results to a temporary location, and
        return a new query that represents fetching or joining against those
        saved results.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `str` ] or \
                `DimensionGroup`, optional
            Dimensions to include in the temporary results. Default is to
            include all dimensions in the query.
        datasets : `~collections.abc.Iterable` [ `str` ], optional
            Names of dataset types that should be included in the new query;
            default is to include `constraint_dataset_types`.

        Returns
        -------
        query : `Query`
            A new query object that represents the materialized rows.

        Notes
        -----
        Only dimension key columns and (at the discretion of the
        implementation) certain dataset columns are actually materialized,
        since at this stage we do not know which dataset or dimension record
        fields are actually needed in result rows, and these can be joined back
        in on the materialized dimension keys. But all constraints on those
        dimension keys (including dataset existence) are applied to the
        materialized rows.
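
        For example, a rough sketch that evaluates an expensive dataset
        constraint once and reuses the saved rows for two result objects (the
        dataset type and collection names here are assumptions)::

            with butler._query() as query:
                query = query.join_dataset_search("raw", collections=["LSSTCam/raw/all"])
                query = query.materialize()
                data_ids = list(query.data_ids())
                refs = list(query.datasets("raw"))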
        """
        if datasets is None:
            datasets = frozenset(self.constraint_dataset_types)
        else:
            datasets = frozenset(datasets)
            if not (datasets <= self.constraint_dataset_types):
                raise InvalidQueryError(
                    f"Dataset(s) {datasets - self.constraint_dataset_types} are not present in the query."
                )
        if dimensions is None:
            dimensions = self._tree.dimensions
        else:
            dimensions = self._driver.universe.conform(dimensions)
        key = self._driver.materialize(self._tree, dimensions, datasets)
        tree = make_identity_query_tree(self._driver.universe).join_materialization(
            key, dimensions=dimensions
        )
        for dataset_type_name in datasets:
            dataset_search = self._tree.datasets[dataset_type_name]
            if not (dataset_search.dimensions <= tree.dimensions):
                raise InvalidQueryError(
                    f"Materialization-backed query has dimensions {tree.dimensions}, which do not "
                    f"cover the dimensions {dataset_search.dimensions} of dataset {dataset_type_name!r}. "
                    "Expand the dimensions or drop this dataset type in the arguments to materialize to "
                    "avoid this error."
                )
            tree = tree.join_dataset(dataset_type_name, self._tree.datasets[dataset_type_name])
        return Query(self._driver, tree)

    def join_dataset_search(
        self,
        dataset_type: str | DatasetType,
        collections: Iterable[str] | None = None,
    ) -> Query:
        """Return a new query with a search for a dataset joined in.

        Parameters
        ----------
        dataset_type : `str` or `DatasetType`
            Dataset type or name. May not refer to a dataset component.
        collections : `~collections.abc.Iterable` [ `str` ], optional
            Iterable of collections to search. Order is preserved, but will
            not matter if the dataset search is only used as a constraint on
            dimensions or if ``find_first=False`` when requesting results. If
            not present or `None`, the default collection search path will be
            used.

        Returns
        -------
        query : `Query`
            A new query object with dataset columns available and rows
            restricted to those consistent with the found data IDs.

        Raises
        ------
        DatasetTypeError
            Raised if the given dataset type is inconsistent with the
            registered dataset type.
        MissingDatasetTypeError
            Raised if the dataset type has not been registered and only a
            `str` dataset type name was given.

        Notes
        -----
        This method may require communication with the server unless the
        dataset type and collections have already been referenced by the same
        query context.
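
        For example, a rough sketch that uses a dataset search purely as an
        existence constraint on data IDs (the dataset type, collection, and
        dimension names here are assumptions)::

            with butler._query() as query:
                query = query.join_dataset_search("raw", collections=["LSSTCam/raw/all"])
                data_ids = list(query.data_ids(["exposure", "detector"]))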
        """
        _, _, query = self._join_dataset_search_impl(
            dataset_type, collections, allow_storage_class_overrides=False
        )
        return query

    def join_data_coordinates(self, iterable: Iterable[DataCoordinate]) -> Query:
        """Return a new query that joins in an explicit table of data IDs.

        Parameters
        ----------
        iterable : `~collections.abc.Iterable` [ `DataCoordinate` ]
            Iterable of `DataCoordinate`. All items must have the same
            dimensions. Must have at least one item.

        Returns
        -------
        query : `Query`
            A new query object with the data IDs joined in.
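
        For example, a rough sketch that constrains the query to two explicit
        data IDs (the dimension names and values here are only illustrative)::

            with butler._query() as query:
                data_ids = [
                    DataCoordinate.standardize(
                        instrument="LSSTCam", detector=detector, universe=butler.dimensions
                    )
                    for detector in (10, 11)
                ]
                query = query.join_data_coordinates(data_ids)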
        """
        rows: set[tuple[DataIdValue, ...]] = set()
        dimensions: DimensionGroup | None = None
        for data_coordinate in iterable:
            if dimensions is None:
                dimensions = data_coordinate.dimensions
            elif dimensions != data_coordinate.dimensions:
                raise InvalidQueryError(
                    f"Inconsistent dimensions: {dimensions} != {data_coordinate.dimensions}."
                )
            rows.add(data_coordinate.required_values)
        if dimensions is None:
            raise InvalidQueryError("Cannot upload an empty data coordinate set.")
        key = self._driver.upload_data_coordinates(dimensions, rows)
        return Query(
            tree=self._tree.join_data_coordinate_upload(dimensions=dimensions, key=key), driver=self._driver
        )

    def join_dimensions(self, dimensions: Iterable[str] | DimensionGroup) -> Query:
        """Return a new query that joins the logical tables for additional
        dimensions.

        Parameters
        ----------
        dimensions : `~collections.abc.Iterable` [ `str` ] or `DimensionGroup`
            Names of dimensions to join in.

        Returns
        -------
        query : `Query`
            A new query object with the dimensions joined in.

        Notes
        -----
        Dimensions are automatically joined in whenever needed, so this method
        should rarely need to be called directly.
        """
        dimensions = self._driver.universe.conform(dimensions)
        return Query(tree=self._tree.join_dimensions(dimensions), driver=self._driver)

    def where(
        self,
        *args: str | Predicate | DataId,
        bind: Mapping[str, Any] | None = None,
        **kwargs: Any,
    ) -> Query:
        """Return a query with a boolean-expression filter on its rows.

        Parameters
        ----------
        *args
            Constraints to apply, combined with logical AND. Arguments may be
            `str` expressions to parse, `Predicate` objects (these are
            typically constructed via `expression_factory`) or data IDs.
        bind : `~collections.abc.Mapping`
            Mapping from string identifier appearing in a string expression to
            a literal value that should be substituted for it. This is
            recommended instead of embedding literals directly into the
            expression, especially for strings, timespans, or other types where
            quoting or formatting is nontrivial.
        **kwargs
            Data ID key value pairs that extend and override any present in
            ``*args``.

        Returns
        -------
        query : `Query`
            A new query object with the given row filters (as well as any
            already present in ``self``). All row filters are combined with
            logical AND.

        Notes
        -----
        If an expression references a dimension or dimension element that is
        not already present in the query, it will be joined in, but dataset
        searches must already be joined into a query in order to reference
        their fields in expressions.

        Data ID values are not checked for consistency; they are extracted from
        ``args`` and then ``kwargs`` and combined, with later values overriding
        earlier ones.
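
        For example, a rough sketch that combines a parsed string expression,
        a ``bind`` value, and a data ID keyword argument (the values here are
        only illustrative)::

            with butler._query() as query:
                query = query.where(
                    "visit.day_obs > my_day_obs",
                    bind={"my_day_obs": 20240701},
                    instrument="LSSTCam",
                )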
        """
        return Query(
            tree=self._tree.where(
                convert_where_args(
                    self.constraint_dimensions,
                    self.constraint_dataset_types,
                    *args,
                    bind=bind,
                    **kwargs,
                )
            ),
            driver=self._driver,
        )

    def _join_dataset_search_impl(
        self,
        dataset_type: str | DatasetType,
        collections: Iterable[str] | None = None,
        allow_storage_class_overrides: bool = True,
    ) -> tuple[str, str, Query]:
        """Implement `join_dataset_search`, and also return the dataset type
        name and storage class, in addition to the modified Query.
        """
        # In this method we need the dimensions of the dataset type, but we
        # might not need the storage class, since the dataset may only be used
        # as an existence constraint. It depends on whether
        # `join_dataset_search` or `datasets` is calling this method.
        dimensions: DimensionGroup | None = None
        storage_class_name: str | None = None
        # Handle DatasetType vs. str arg.
        if isinstance(dataset_type, DatasetType):
            dataset_type_name = dataset_type.name
            dimensions = dataset_type.dimensions.as_group()
            storage_class_name = dataset_type.storageClass_name
        elif isinstance(dataset_type, str):
            dataset_type_name = dataset_type
        else:
            raise TypeError(f"Invalid dataset type argument {dataset_type!r}.")
        # See if this dataset has already been joined into the query.
        if existing_search := self._tree.datasets.get(dataset_type_name):
            if collections is None:
                collections = existing_search.collections
            else:
                collections = tuple(ensure_iterable(collections))
                if collections != existing_search.collections:
                    raise InvalidQueryError(
                        f"Dataset type {dataset_type_name!r} was already joined into this "
                        "query with a different collection search path (previously "
                        f"[{', '.join(existing_search.collections)}], now [{', '.join(collections)}])."
                    )
            if dimensions is None:
                dimensions = existing_search.dimensions
        else:
            if collections is None:
                collections = self._driver.get_default_collections()
            collections = tuple(ensure_iterable(collections))
        # Look up the data repository definition of the dataset type to check
        # for consistency, or get dimensions and storage class if we don't have
        # them.
        resolved_dataset_type = self._driver.get_dataset_type(dataset_type_name)
        resolved_dimensions = resolved_dataset_type.dimensions.as_group()
        if dimensions is not None and dimensions != resolved_dimensions:
            raise DatasetTypeError(
                f"Given dimensions {dimensions} for dataset type {dataset_type_name!r} do not match the "
                f"registered dimensions {resolved_dimensions}."
            )
        if storage_class_name is not None:
            if storage_class_name != resolved_dataset_type.storageClass_name:
                if not allow_storage_class_overrides:
                    raise InvalidQueryError(
                        f"Storage class {storage_class_name!r} for dataset type {dataset_type!r} differs "
                        f"from repository definition {resolved_dataset_type.storageClass_name!r}, but "
                        "join_dataset_search does not care about storage classes and cannot record this "
                        "override. Pass the override to `Query.datasets` instead."
                    )
                if not (
                    StorageClassFactory()
                    .getStorageClass(storage_class_name)
                    .can_convert(resolved_dataset_type.storageClass)
                ):
                    raise DatasetTypeError(
                        f"Given storage class {storage_class_name!r} for {dataset_type_name!r} is not "
                        f"compatible with repository storage class {resolved_dataset_type.storageClass_name}."
                    )
        else:
            storage_class_name = resolved_dataset_type.storageClass_name
        dataset_search = DatasetSearch.model_construct(
            collections=collections,
            dimensions=resolved_dimensions,
        )
        return (
            dataset_type_name,
            storage_class_name,
            Query(self._driver, self._tree.join_dataset(dataset_type_name, dataset_search)),
        )