Coverage for python/lsst/daf/butler/registry/queries/_datasets.py : 17%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("DatasetRegistryStorage", "Like", "DatasetTypeExpression", "CollectionsExpression")

from dataclasses import dataclass
from typing import List, Mapping, Optional, Sequence, Union

from sqlalchemy.engine import Connection
from sqlalchemy.sql import ColumnElement, FromClause, and_, case, false, or_, select

from ...core import (
    DatasetType,
    ExpandedDataCoordinate,
    DimensionGraph,
    DimensionUniverse,
)
@dataclass(frozen=True)
class Like:
    """Tag type marking a string as an SQL ``LIKE`` pattern.

    Wrapping a string in `Like` tells query-construction code to match it
    with the SQL ``LIKE`` operator instead of treating it as a complete,
    exact name.
    """

    pattern: str
    """The pattern string, expressed in SQL ``LIKE`` syntax.
    """
DatasetTypeExpression = Union[DatasetType, str, Like, type(Ellipsis)]
"""Type annotation alias for the types accepted when querying for a dataset
type.

The special value ``...`` (Ellipsis) acts as a full wildcard, indicating
that any `DatasetType` will be matched.
"""

CollectionsExpression = Union[Sequence[Union[str, Like]], type(Ellipsis)]
"""Type annotation alias for the types accepted to describe the collections
to be searched for a dataset.

The special value ``...`` (Ellipsis) acts as a full wildcard, indicating
that all collections will be searched.
"""
def makeCollectionsWhereExpression(column: ColumnElement,
                                   collections: CollectionsExpression) -> Optional[ColumnElement]:
    """Construct a boolean SQL expression corresponding to a Python expression
    for the collections to search for one or more datasets.

    Parameters
    ----------
    column : `sqlalchemy.sql.ColumnElement`
        The "collection" name column from a dataset subquery or table.
    collections : `list` of `str` or `Like`, or ``...``
        An expression indicating the collections to be searched.  This may
        be a sequence containing complete collection names (`str` values),
        wildcard expressions (`Like` instances) or the special value ``...``,
        indicating all collections.

    Returns
    -------
    where : `sqlalchemy.sql.ColumnElement` or `None`
        A boolean SQL expression object, or `None` if all collections are to
        be searched and hence there is no WHERE expression for the given Python
        expression (or, more precisely, the WHERE expression is the literal
        "true", but we don't want to pollute our SQL queries with those when
        we can avoid it).
    """
    if collections is ...:
        return None
    terms = []
    equalities = []
    for collection in collections:
        if isinstance(collection, Like):
            terms.append(column.like(collection.pattern))
        else:
            equalities.append(collection)
    # A single complete name can use plain equality; several use IN.
    if len(equalities) == 1:
        terms.append(column == equalities[0])
    elif equalities:
        terms.append(column.in_(equalities))
    if not terms:
        # An empty sequence of collections matches nothing.  Calling or_()
        # with zero clauses is deprecated in SQLAlchemy and yields an empty
        # (effectively no-op) expression, so return an explicit SQL FALSE
        # instead.
        return false()
    return or_(*terms)
class DatasetRegistryStorage:
    """An object managing ``dataset`` and related tables in a `Registry`.

    Parameters
    ----------
    connection : `sqlalchemy.engine.Connection`
        A SQLAlchemy connection object, typically shared with the `Registry`
        that will own the storage instances.
    universe : `DimensionUniverse`
        The set of all dimensions for which storage instances should be
        constructed.
    tables : `dict`
        A dictionary mapping table name to a `sqlalchemy.sql.FromClause`
        representing that table.

    Notes
    -----
    Future changes will convert this concrete class into a polymorphic
    hierarchy modeled after `DimensionRecordStorage`, with many more
    `SqlRegistry` method implementations delegating to it.  Its interface
    may change significantly at the same time.  At present, this
    functionality has been factored out of `SqlRegistry` (with a bit of
    duplication) to allow the initial `QueryBuilder` design and
    implementation to be more forward-looking.
    """

    def __init__(self, connection: Connection, universe: DimensionUniverse,
                 tables: Mapping[str, FromClause]):
        self._connection = connection
        self._universe = universe
        self._datasetTypeTable = tables["dataset_type"]
        self._datasetTypeDimensionsTable = tables["dataset_type_dimensions"]
        self._datasetTable = tables["dataset"]
        self._datasetCollectionTable = tables["dataset_collection"]

    def fetchDatasetTypes(self, datasetType: DatasetTypeExpression = ..., *,
                          collections: CollectionsExpression = ...,
                          dataId: Optional[ExpandedDataCoordinate] = None) -> List[DatasetType]:
        """Retrieve `DatasetType` instances from the database matching an
        expression.

        Parameters
        ----------
        datasetType : `str`, `Like`, `DatasetType`, or ``...``
            An expression indicating the dataset type(s) to fetch.  A true
            `DatasetType` instance is returned directly without querying the
            database; a `str` selects the dataset type with exactly that
            name (if it exists); a `Like` selects dataset types whose names
            match the pattern; and the special value ``...`` selects all
            dataset types.  When nothing matches, an empty `list` is
            returned.
        collections : sequence of `str` or `Like`, or ``...``
            An expression indicating collections that *may* be used to limit
            the dataset types returned to only those that might have
            datasets in these collections.  This is intended as an
            optimization for higher-level functionality; it may simply be
            ignored, and cannot be relied upon to filter the returned
            dataset types.
        dataId : `ExpandedDataCoordinate`, optional
            A data ID that *may* be used to limit the dataset types returned
            to only those with datasets matching the given data ID.  Like
            ``collections``, this is purely an optional optimization hint
            and cannot be relied upon for filtering.

        Returns
        -------
        datasetTypes : `list` of `DatasetType`
            All datasets in the registry matching the given arguments.
        """
        if isinstance(datasetType, DatasetType):
            # Caller already holds a complete DatasetType.  We are allowed
            # (but not required) to filter on collections/dataId, so simply
            # hand it back.
            return [datasetType]
        nameColumn = self._datasetTypeTable.columns.dataset_type_name
        if datasetType is ...:
            # Full wildcard: no restriction on the dataset types at all.
            whereTerms = []
        elif isinstance(datasetType, str):
            whereTerms = [nameColumn == datasetType]
        elif isinstance(datasetType, Like):
            whereTerms = [nameColumn.like(datasetType.pattern)]
        else:
            raise TypeError(f"Unexpected dataset type expression '{datasetType}' in query.")
        query = select([
            nameColumn,
            self._datasetTypeTable.columns.storage_class,
            self._datasetTypeDimensionsTable.columns.dimension_name,
        ]).select_from(
            self._datasetTypeTable.join(self._datasetTypeDimensionsTable)
        )
        if whereTerms:
            query = query.where(*whereTerms)
        # The collections and dataId arguments are currently unused; they
        # are accepted so future code *may* restrict the list of returned
        # dataset types.
        grouped = {}
        for name, storageClassName, dimensionName in self._connection.execute(query).fetchall():
            # One row per (dataset type, dimension); accumulate the set of
            # dimension names for each dataset type.
            grouped.setdefault(name, (storageClassName, set()))[1].add(dimensionName)
        return [DatasetType(name,
                            dimensions=DimensionGraph(self._universe, names=dimensionNames),
                            storageClass=storageClassName)
                for name, (storageClassName, dimensionNames) in grouped.items()]

    def getDatasetSubquery(self, datasetType: DatasetType, *,
                           collections: CollectionsExpression,
                           dataId: Optional[ExpandedDataCoordinate] = None,
                           isResult: bool = True,
                           addRank: bool = False) -> FromClause:
        """Return a SQL expression that searches for a dataset of a
        particular type in one or more collections.

        Parameters
        ----------
        datasetType : `DatasetType`
            Type of dataset to search for.  Must be a true `DatasetType`;
            call `fetchDatasetTypes` first to expand an expression if
            desired.
        collections : sequence of `str` or `Like`, or ``...``
            An expression describing the collections in which to search for
            the datasets.  ``...`` indicates that all collections should be
            searched.  Returned datasets are guaranteed to be from one of
            the given collections (unlike the behavior of the same argument
            in `fetchDatasetTypes`).
        dataId : `ExpandedDataCoordinate`, optional
            A data ID that *may* be used to limit the datasets returned to
            only those matching the given data ID.  This is an optional
            optimization hint for higher-level functionality; it may simply
            be ignored.
        isResult : `bool`, optional
            If `True` (default), include the ``dataset_id`` column in the
            result columns of the query.
        addRank : `bool`, optional
            If `True` (`False` is default), also include a calculated column
            that ranks the collection in which the dataset was found (lower
            is better).  Requires that all entries in ``collections`` be
            regular strings, so there is a clear search order.  Ignored if
            ``isResult`` is `False`.

        Returns
        -------
        subquery : `sqlalchemy.sql.FromClause`
            Named subquery or table that can be used in the FROM clause of
            a SELECT query.  Has at least columns for all dimensions in
            ``datasetType.dimensions``; may have additional columns
            depending on the values of ``isResult`` and ``addRank``.
        """
        # Dimension columns are always selected: they are what callers use
        # to join this subquery against other tables.
        columns = [self._datasetTable.columns[dimension.name]
                   for dimension in datasetType.dimensions]
        if isResult:
            # dataset_id (and, optionally, the collection rank) only matter
            # when the caller will actually select from this subquery.
            columns.append(self._datasetTable.columns.dataset_id)
            if addRank:
                if collections is ...:
                    raise TypeError("Cannot rank collections when no collections are provided.")
                ranks = {}
                for rank, collection in enumerate(collections):
                    if isinstance(collection, Like):
                        raise TypeError(
                            f"Cannot rank collections that include LIKE pattern '{collection.pattern}'."
                        )
                    ranks[collection] = rank
                columns.append(
                    case(
                        ranks,
                        value=self._datasetCollectionTable.columns.collection
                    ).label("rank")
                )
        whereTerms = [self._datasetTable.columns.dataset_type_name == datasetType.name]
        collectionsTerm = makeCollectionsWhereExpression(
            self._datasetCollectionTable.columns.collection,
            collections,
        )
        if collectionsTerm is not None:
            whereTerms.append(collectionsTerm)
        return select(
            columns
        ).select_from(
            self._datasetTable.join(self._datasetCollectionTable)
        ).where(
            and_(*whereTerms)
        ).alias(datasetType.name)