Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 97%
81 statements
coverage.py v7.2.7, created at 2023-07-21 09:54 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("CollectionSummaryManager",)

from collections.abc import Iterable, Mapping
from typing import Any, Generic, TypeVar

import sqlalchemy

from ....core import (
    DatasetType,
    GovernorDimension,
    NamedKeyDict,
    NamedKeyMapping,
    addDimensionForeignKey,
    ddl,
)
from ..._collection_summary import CollectionSummary
from ..._collectionType import CollectionType
from ...interfaces import (
    ChainedCollectionRecord,
    CollectionManager,
    CollectionRecord,
    Database,
    DimensionRecordStorageManager,
    StaticTablesContext,
)

_T = TypeVar("_T")


class CollectionSummaryTables(Generic[_T]):
    """Structure that holds the table or table specification objects that
    summarize the contents of collections.

    Parameters
    ----------
    datasetType : _T
        Table [specification] that summarizes which dataset types are in each
        collection.
    dimensions : `NamedKeyMapping` [ `GovernorDimension`, _T ]
        Mapping of table [specifications] that summarize which governor
        dimension values are present in the data IDs of each collection.
    """

    def __init__(
        self,
        datasetType: _T,
        dimensions: NamedKeyMapping[GovernorDimension, _T],
    ):
        self.datasetType = datasetType
        self.dimensions = dimensions

    @classmethod
    def makeTableSpecs(
        cls,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[ddl.TableSpec]:
        """Create specifications for all summary tables.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `ddl.TableSpec` ]
            Structure containing table specifications.
        """
        # Spec for collection_summary_dataset_type.
        datasetTypeTableSpec = ddl.TableSpec(fields=[])
        collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE")
        datasetTypeTableSpec.fields.add(
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True)
        )
        datasetTypeTableSpec.foreignKeys.append(
            ddl.ForeignKeySpec(
                "dataset_type", source=("dataset_type_id",), target=("id",), onDelete="CASCADE"
            )
        )
        # Specs for collection_summary_<dimension>.
        dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]()
        for dimension in dimensions.universe.getGovernorDimensions():
            tableSpec = ddl.TableSpec(fields=[])
            collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE")
            addDimensionForeignKey(tableSpec, dimension, primaryKey=True)
            dimensionTableSpecs[dimension] = tableSpec
        return CollectionSummaryTables(
            datasetType=datasetTypeTableSpec,
            dimensions=dimensionTableSpecs.freeze(),
        )
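
# A minimal usage sketch for ``makeTableSpecs`` (kept as a comment so nothing
# executes at import time; the ``collections`` and ``dimensions`` manager
# instances are assumptions here, normally wired up by `Registry` during
# schema construction):
#
#     specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
#     assert isinstance(specs.datasetType, ddl.TableSpec)
#     for dimension, spec in specs.dimensions.items():
#         print(dimension.name, [field.name for field in spec.fields])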


class CollectionSummaryManager:
    """Object that manages the summaries of the dataset types and governor
    dimension values present in a collection.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    tables : `CollectionSummaryTables`
        Struct containing the tables that hold collection summaries.
    """

    def __init__(
        self,
        db: Database,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        tables: CollectionSummaryTables[sqlalchemy.schema.Table],
    ):
        self._db = db
        self._collections = collections
        self._collectionKeyName = collections.getCollectionForeignKeyName()
        self._dimensions = dimensions
        self._tables = tables
        self._cache: dict[Any, CollectionSummary] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryManager:
        """Create all summary tables (or check that they have been created),
        returning an object to manage them.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `CollectionSummaryManager`
            New manager object for collection summaries.
        """
        specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
        tables = CollectionSummaryTables(
            datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType),
            dimensions=NamedKeyDict[GovernorDimension, sqlalchemy.schema.Table](
                {
                    dimension: context.addTable(f"collection_summary_{dimension.name}", spec)
                    for dimension, spec in specs.dimensions.items()
                }
            ).freeze(),
        )
        return cls(
            db=db,
            collections=collections,
            dimensions=dimensions,
            tables=tables,
        )
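
    # Illustrative sketch of how ``initialize`` is typically reached (kept as
    # a comment; ``Database.declareStaticTables`` is the real entry point, but
    # the surrounding wiring is simplified and the variable names are
    # assumptions):
    #
    #     with db.declareStaticTables(create=True) as context:
    #         manager = CollectionSummaryManager.initialize(
    #             db, context, collections=collections, dimensions=dimensions
    #         )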

    def update(
        self,
        collection: CollectionRecord,
        dataset_type_ids: Iterable[int],
        summary: CollectionSummary,
    ) -> None:
        """Update the summary tables to associate the given collection with
        the given dataset types and governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        dataset_type_ids : `~collections.abc.Iterable` [ `int` ]
            Integer IDs for the dataset types to associate with this
            collection.
        summary : `CollectionSummary`
            Summary to store. Dataset types must correspond to
            ``dataset_type_ids``.

        Notes
        -----
        This method should only be called inside the transaction context of
        another operation that inserts or associates datasets.
        """
        self._db.ensure(
            self._tables.datasetType,
            *[
                {
                    "dataset_type_id": dataset_type_id,
                    self._collectionKeyName: collection.key,
                }
                for dataset_type_id in dataset_type_ids
            ],
        )
        for dimension, values in summary.governors.items():
            if values:
                self._db.ensure(
                    self._tables.dimensions[dimension],
                    *[{self._collectionKeyName: collection.key, dimension: v} for v in values],
                )
        # Update the in-memory cache, too. These changes will remain even if
        # the database inserts above are rolled back by some later exception in
        # the same transaction, but that's okay: we never promise that a
        # CollectionSummary has _just_ the dataset types and governor dimension
        # values that are actually present, only that it is guaranteed to
        # contain any dataset types or governor dimension values that _may_ be
        # present.
        # That guarantee (and the possibility of rollbacks) means we can't get
        # away with checking the cache before we try the database inserts,
        # however; if someone had attempted to insert datasets of some dataset
        # type previously, and that rolled back, and we're now trying to insert
        # some more datasets of that same type, it would not be okay to skip
        # the DB summary table insertions because we found entries in the
        # in-memory cache.
        self.get(collection).update(summary)
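
    # Hedged usage sketch (a comment, not executed): ``update`` assumes the
    # caller has already opened a transaction, e.g. via the real
    # ``Database.transaction`` context manager; the record and ID variables
    # below are hypothetical:
    #
    #     with db.transaction():
    #         ...  # insert or associate datasets of this type here
    #         manager.update(collection_record, [dataset_type_id], summary)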

    def refresh(self, dataset_types: Mapping[int, DatasetType]) -> None:
        """Load all collection summary information from the database.

        Parameters
        ----------
        dataset_types : `~collections.abc.Mapping` [`int`, `DatasetType`]
            Mapping of an `int` dataset_type_id value to `DatasetType`
            instance. Summaries are only loaded for dataset types that appear
            in this mapping.
        """
        # Set up the SQL query we'll use to fetch all of the summary
        # information at once.
        columns = [
            self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName),
            self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id"),
        ]
        fromClause: sqlalchemy.sql.expression.FromClause = self._tables.datasetType
        for dimension, table in self._tables.dimensions.items():
            columns.append(table.columns[dimension.name].label(dimension.name))
            fromClause = fromClause.join(
                table,
                onclause=(
                    self._tables.datasetType.columns[self._collectionKeyName]
                    == table.columns[self._collectionKeyName]
                ),
                isouter=True,
            )
        sql = sqlalchemy.sql.select(*columns).select_from(fromClause)
        # Run the query and construct CollectionSummary objects from the result
        # rows. This will never include CHAINED collections or collections
        # with no datasets.
        summaries: dict[Any, CollectionSummary] = {}
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        for row in sql_rows:
            # Collection key should never be None/NULL; it's what we join on.
            # Extract that and then turn it into a collection name.
            collectionKey = row[self._collectionKeyName]
            # dataset_type_id should also never be None/NULL; it's in the first
            # table we joined.
            if datasetType := dataset_types.get(row["dataset_type_id"]):
                # See if we have a summary already for this collection; if not,
                # make one.
                summary = summaries.get(collectionKey)
                if summary is None:
                    summary = CollectionSummary()
                    summaries[collectionKey] = summary
                # Update the dimensions with the values in this row that
                # aren't None/NULL (many will be, in general, because these
                # enter the query via LEFT OUTER JOIN).
                summary.dataset_types.add(datasetType)
                for dimension in self._tables.dimensions:
                    value = row[dimension.name]
                    if value is not None:
                        summary.governors.setdefault(dimension.name, set()).add(value)
        self._cache = summaries
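
    # Illustrative sketch of a ``refresh`` call (comment only; the
    # ``dataset_types`` mapping would normally come from the dataset-type
    # manager, and the integer ID used here is made up):
    #
    #     manager.refresh({1: registry.getDatasetType("raw")})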

    def get(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        summary = self._cache.get(collection.key)
        if summary is None:
            # When we load the summary information from the database, we don't
            # create summaries for CHAINED collections; those are created here
            # as needed, and *never* cached - we have no good way to update
            # those summaries when a new dataset is added to a child
            # collection.
            if collection.type is CollectionType.CHAINED:
                assert isinstance(collection, ChainedCollectionRecord)
                child_summaries = [self.get(self._collections.find(child)) for child in collection.children]
                if child_summaries:
                    summary = CollectionSummary.union(*child_summaries)
                else:
                    summary = CollectionSummary()
            else:
                # Either this collection doesn't have any datasets yet, or the
                # only datasets it has were created by some other process since
                # the last call to refresh. We assume the former; the user is
                # responsible for calling refresh if they want to read
                # concurrently-written things. We do remember this in the
                # cache.
                summary = CollectionSummary()
                self._cache[collection.key] = summary
        return summary
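
    # Behavior sketch for ``get`` (comment only; ``my_chain`` is a
    # hypothetical collection name): for a CHAINED collection the summary is
    # recomputed on each call as the union of its children's summaries, and
    # is deliberately not cached:
    #
    #     chained = collections.find("my_chain")
    #     summary = manager.get(chained)
    #     print(summary.dataset_types, summary.governors)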