Coverage for python/lsst/daf/butler/registry/queries/_datasets.py : 15%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ["DatasetRegistryStorage"]
25from typing import Any, Mapping, Iterator
27import sqlalchemy
29from ...core import (
30 DatasetType,
31 DimensionGraph,
32 DimensionUniverse,
33)
34from .._collectionType import CollectionType
35from ..interfaces import CollectionManager, CollectionRecord
36from ..wildcards import CategorizedWildcard, CollectionSearch, CollectionQuery
class DatasetRegistryStorage:
    """An object managing ``dataset`` and related tables in a `Registry`.

    Parameters
    ----------
    connection : `sqlalchemy.engine.Connection`
        A SQLAlchemy connection object, typically shared with the `Registry`
        that will own the storage instances.
    universe : `DimensionUniverse`
        The set of all dimensions for which storage instances should be
        constructed.
    tables : `dict`
        A dictionary mapping table name to a `sqlalchemy.sql.FromClause`
        representing that table.
    collections : `CollectionManager`
        Manager used to resolve collection expressions into records and to
        look up the collection/run foreign key column names.

    Notes
    -----
    Future changes will convert this concrete class into a polymorphic
    hierarchy modeled after `DimensionRecordStorage`, with many more
    `SqlRegistry` method implementations delegating to it.  Its interface
    may change significantly at the same time.  At present, this
    functionality has been factored out of `SqlRegistry` (with a bit of
    duplication) to allow the initial `QueryBuilder` design and
    implementation to be more forward-looking.
    """
    def __init__(self, connection: sqlalchemy.engine.Connection, universe: DimensionUniverse,
                 tables: Mapping[str, sqlalchemy.sql.FromClause], *,
                 collections: CollectionManager):
        self._connection = connection
        self._universe = universe
        self._collections = collections
        # Bind the handful of tables we use to short attribute names so the
        # query-building code below stays readable.
        self._datasetTypeTable = tables["dataset_type"]
        self._datasetTypeDimensionsTable = tables["dataset_type_dimensions"]
        self._datasetTable = tables["dataset"]
        self._datasetCollectionTable = tables["dataset_collection"]

    def fetchDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]:
        """Retrieve `DatasetType` instances from the database matching an
        expression.

        Parameters
        ----------
        expression
            An expression indicating the dataset type(s) to fetch.
            See :ref:`daf_butler_dataset_type_expressions` for more
            information.

        Yields
        ------
        datasetType
            A dataset type matching the given argument.
        """
        typeTable = self._datasetTypeTable
        dimensionsTable = self._datasetTypeDimensionsTable
        stmt = sqlalchemy.sql.select([
            typeTable.columns.dataset_type_name,
            typeTable.columns.storage_class,
            dimensionsTable.columns.dimension_name,
        ]).select_from(
            typeTable.join(dimensionsTable)
        )
        categorized = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
        if categorized is not ...:
            condition = categorized.makeWhereExpression(typeTable.columns.dataset_type_name)
            if condition is None:
                # The expression cannot match any dataset type; yield nothing
                # without touching the database.
                return
            stmt = stmt.where(condition)
        # The join produces one row per (dataset type, dimension) pair; fold
        # those rows into one (storage class, dimension-name set) entry per
        # dataset type name.
        accumulated: dict = {}
        for typeName, storageClassName, dimensionName in self._connection.execute(stmt).fetchall():
            accumulated.setdefault(typeName, (storageClassName, set()))[1].add(dimensionName)
        for typeName, (storageClassName, dimensionNames) in accumulated.items():
            yield DatasetType(typeName,
                              dimensions=DimensionGraph(self._universe, names=dimensionNames),
                              storageClass=storageClassName)

    def getDatasetSubquery(self, datasetType: DatasetType, *,
                           collections: Any,
                           isResult: bool = True,
                           addRank: bool = False) -> sqlalchemy.sql.FromClause:
        """Return a SQL expression that searches for a dataset of a particular
        type in one or more collections.

        Parameters
        ----------
        datasetType : `DatasetType`
            Type of dataset to search for.  Must be a true `DatasetType`;
            call `fetchDatasetTypes` first to expand an expression if desired.
        collections
            An expression describing the collections to search and any
            restrictions on the dataset types to search within them.
            See :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the ``dataset_id`` column in the
            result columns of the query.
        addRank : `bool`, optional
            If `True` (`False` is default), also include a calculated column
            that ranks the collection in which the dataset was found (lower
            is better).  Requires that ``collections`` must be an *ordered*
            expression (regular expressions and `...` are not allowed).

        Returns
        -------
        subquery : `sqlalchemy.sql.FromClause`
            Named subquery or table that can be used in the FROM clause of
            a SELECT query.  Has at least columns for all dimensions in
            ``datasetType.dimensions``; may have additional columns depending
            on the values of ``isResult`` and ``addRank``.
        """
        datasetTable = self._datasetTable

        def constrain(select: sqlalchemy.sql.Select, record: CollectionRecord):
            # Restrict a SELECT to a single collection, choosing the FROM
            # clause and the collection-identifying column according to the
            # collection's type.
            if record.type is CollectionType.TAGGED:
                # Tagged membership lives in the dataset_collection table.
                keyColumn = \
                    self._datasetCollectionTable.columns[self._collections.getCollectionForeignKeyName()]
                source = datasetTable.join(self._datasetCollectionTable)
            elif record.type is CollectionType.RUN:
                # Run membership is a column on the dataset table itself.
                keyColumn = datasetTable.columns[self._collections.getRunForeignKeyName()]
                source = datasetTable
            else:
                raise NotImplementedError(f"Unrecognized CollectionType: '{record.type}'.")
            return select.select_from(
                source
            ).where(
                sqlalchemy.sql.and_(datasetTable.columns.dataset_type_name == datasetType.name,
                                    keyColumn == record.key)
            )

        # Dimension columns always appear, because they are what we use to
        # join against other tables.
        columns = [datasetTable.columns[dimension.name] for dimension in datasetType.dimensions]
        # dataset_id (and, below, rank) are only worth selecting when the
        # caller intends to read columns from this subquery.
        if isResult:
            columns.append(datasetTable.columns.dataset_id)
        if addRank:
            # Ranked search demands an ordered expression, so use
            # CollectionSearch; attach each collection's position as "rank".
            search = CollectionSearch.fromExpression(collections)
            perCollection = [
                constrain(
                    sqlalchemy.sql.select(columns + [sqlalchemy.sql.literal(position).label("rank")]),
                    record,
                )
                for position, record in enumerate(search.iter(self._collections, datasetType=datasetType))
            ]
        else:
            # Unordered path: no rank column, and the expression is
            # interpreted as a CollectionQuery instead of a CollectionSearch.
            query = CollectionQuery.fromExpression(collections)
            perCollection = [
                constrain(sqlalchemy.sql.select(columns), record)
                for record in query.iter(self._collections, datasetType=datasetType)
            ]
        # UNION the single-collection queries and name the result after the
        # dataset type so it can appear directly in a FROM clause.
        return sqlalchemy.sql.union_all(*perCollection).alias(datasetType.name)