Coverage for python/lsst/daf/butler/registry/queries/_datasets.py : 14%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["DatasetRegistryStorage"]
25from typing import Any, Mapping, Iterator, Optional
27import sqlalchemy
29from ...core import (
30 DatasetType,
31 DimensionGraph,
32 DimensionUniverse,
33)
34from .._collectionType import CollectionType
35from ..interfaces import CollectionManager, CollectionRecord
36from ..wildcards import CategorizedWildcard, CollectionSearch, CollectionQuery
class DatasetRegistryStorage:
    """An object managing ``dataset`` and related tables in a `Registry`.

    Parameters
    ----------
    connection : `sqlalchemy.engine.Connection`
        A SQLAlchemy connection object, typically shared with the `Registry`
        that will own the storage instances.
    universe : `DimensionUniverse`
        The set of all dimensions for which storage instances should be
        constructed.
    tables : `dict`
        A dictionary mapping table name to a `sqlalchemy.sql.FromClause`
        representing that table.
    collections : `CollectionManager`
        Manager object for the collection and run records referenced by the
        dataset tables.

    Notes
    -----
    Future changes will convert this concrete class into a polymorphic
    hierarchy modeled after `DimensionRecordStorage`, with many more
    `SqlRegistry` method implementations delegating to it.  Its interface
    may change significantly at the same time.  At present, this functionality
    has been factored out of `SqlRegistry` (with a bit of duplication) to
    allow the initial `QueryBuilder` design and implementation to be more
    forward-looking.
    """
    def __init__(self, connection: sqlalchemy.engine.Connection, universe: DimensionUniverse,
                 tables: Mapping[str, sqlalchemy.sql.FromClause], *,
                 collections: CollectionManager):
        self._connection = connection
        self._universe = universe
        self._collections = collections
        # Tables are looked up once here so typos in table names fail fast
        # at construction time rather than at query time.
        self._datasetTypeTable = tables["dataset_type"]
        self._datasetTypeDimensionsTable = tables["dataset_type_dimensions"]
        self._datasetTable = tables["dataset"]
        self._datasetCollectionTable = tables["dataset_collection"]

    def fetchDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]:
        """Retrieve `DatasetType` instances from the database matching an
        expression.

        Parameters
        ----------
        expression
            An expression indicating the dataset type(s) to fetch.
            See :ref:`daf_butler_dataset_type_expressions` for more
            information.

        Yields
        ------
        datasetType
            A dataset type matching the given argument.
        """
        # One row per (dataset type, dimension) pair; dimensions are
        # re-assembled into sets below.
        query = sqlalchemy.sql.select([
            self._datasetTypeTable.columns.dataset_type_name,
            self._datasetTypeTable.columns.storage_class,
            self._datasetTypeDimensionsTable.columns.dimension_name,
        ]).select_from(
            self._datasetTypeTable.join(self._datasetTypeDimensionsTable)
        )
        wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
        if wildcard is not ...:
            where = wildcard.makeWhereExpression(self._datasetTypeTable.columns.dataset_type_name)
            if where is None:
                # The expression cannot match anything; don't bother querying.
                return
            query = query.where(where)
        # Run the query and group by dataset type name.  Each value is a
        # (storage class name, set of dimension names) pair.
        grouped = {}
        for row in self._connection.execute(query).fetchall():
            datasetTypeName, storageClassName, dimensionName = row
            _, dimensionNames = grouped.setdefault(datasetTypeName, (storageClassName, set()))
            dimensionNames.add(dimensionName)
        for datasetTypeName, (storageClassName, dimensionNames) in grouped.items():
            yield DatasetType(datasetTypeName,
                              dimensions=DimensionGraph(self._universe, names=dimensionNames),
                              storageClass=storageClassName)

    def getDatasetSubquery(self, datasetType: DatasetType, *,
                           collections: Any,
                           isResult: bool = True,
                           addRank: bool = False) -> Optional[sqlalchemy.sql.FromClause]:
        """Return a SQL expression that searches for a dataset of a particular
        type in one or more collections.

        Parameters
        ----------
        datasetType : `DatasetType`
            Type of dataset to search for.  Must be a true `DatasetType`;
            call `fetchDatasetTypes` first to expand an expression if desired.
        collections
            An expression describing the collections to search and any
            restrictions on the dataset types to search within them.
            See :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the ``dataset_id`` column in the
            result columns of the query.
        addRank : `bool`, optional
            If `True` (`False` is default), also include a calculated column
            that ranks the collection in which the dataset was found (lower
            is better).  Requires that ``collections`` must be an *ordered*
            expression (regular expressions and `...` are not allowed).
            Ignored unless ``isResult`` is also `True`.

        Returns
        -------
        subquery : `sqlalchemy.sql.FromClause` or `None`
            Named subquery or table that can be used in the FROM clause of
            a SELECT query.  Has at least columns for all dimensions in
            ``datasetType.dimensions``; may have additional columns depending
            on the values of ``isResult`` and ``addRank``.  May be `None` if
            it is known that the query would return no results.
        """
        # Always include dimension columns, because that's what we use to
        # join against other tables.
        columns = [self._datasetTable.columns[dimension.name] for dimension in datasetType.dimensions]

        def finishSubquery(select: sqlalchemy.sql.Select, collectionRecord: CollectionRecord):
            # Attach the FROM clause and WHERE restriction appropriate for
            # the given collection's type to a per-collection SELECT.
            if collectionRecord.type is CollectionType.TAGGED:
                # TAGGED collections associate datasets via a join table.
                collectionColumn = \
                    self._datasetCollectionTable.columns[self._collections.getCollectionForeignKeyName()]
                fromClause = self._datasetTable.join(self._datasetCollectionTable)
            elif collectionRecord.type is CollectionType.RUN:
                # RUN membership is recorded directly on the dataset table.
                collectionColumn = self._datasetTable.columns[self._collections.getRunForeignKeyName()]
                fromClause = self._datasetTable
            else:
                raise NotImplementedError(f"Unrecognized CollectionType: '{collectionRecord.type}'.")
            return select.select_from(
                fromClause
            ).where(
                sqlalchemy.sql.and_(self._datasetTable.columns.dataset_type_name == datasetType.name,
                                    collectionColumn == collectionRecord.key)
            )

        # A list of single-collection queries that we'll UNION together.
        subsubqueries = []
        # Only include dataset_id and the rank of the collection in the given
        # list if caller has indicated that they're going to be actually
        # selecting columns from this subquery in the larger query.
        if isResult:
            columns.append(self._datasetTable.columns.dataset_id)
        if isResult and addRank:
            # Ranking requires an *ordered* collection expression, so we use
            # CollectionSearch and attach the position as a literal column.
            collections = CollectionSearch.fromExpression(collections)
            for n, record in enumerate(collections.iter(self._collections, datasetType=datasetType)):
                subsubqueries.append(
                    finishSubquery(
                        sqlalchemy.sql.select(
                            columns + [sqlalchemy.sql.literal(n).label("rank")]
                        ),
                        record
                    )
                )
        else:
            # No rank column needed; an unordered CollectionQuery suffices.
            collections = CollectionQuery.fromExpression(collections)
            for record in collections.iter(self._collections, datasetType=datasetType):
                subsubqueries.append(finishSubquery(sqlalchemy.sql.select(columns), record))
        # Bug fix: previously the addRank path called union_all() even with
        # zero per-collection selects, which raises; both paths now return
        # None when there is nothing to search, per the documented contract.
        if not subsubqueries:
            return None
        return sqlalchemy.sql.union_all(*subsubqueries).alias(datasetType.name)