Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 97%
84 statements
coverage.py v7.3.2, created at 2023-10-27 09:43 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

from .... import ddl

__all__ = ("CollectionSummaryManager",)

from collections.abc import Iterable, Mapping
from typing import Any, Generic, TypeVar

import sqlalchemy

from ...._dataset_type import DatasetType
from ...._named import NamedKeyDict, NamedKeyMapping
from ....dimensions import GovernorDimension, addDimensionForeignKey
from ..._collection_summary import CollectionSummary
from ..._collection_type import CollectionType
from ...interfaces import (
    ChainedCollectionRecord,
    CollectionManager,
    CollectionRecord,
    Database,
    DimensionRecordStorageManager,
    StaticTablesContext,
)

_T = TypeVar("_T")


class CollectionSummaryTables(Generic[_T]):
    """Structure that holds the table or table specification objects that
    summarize the contents of collections.

    Parameters
    ----------
    datasetType
        Table [specification] that summarizes which dataset types are in each
        collection.
    dimensions
        Mapping of table [specifications] that summarize which governor
        dimension values are present in the data IDs of each collection.
    """

    def __init__(
        self,
        datasetType: _T,
        dimensions: NamedKeyMapping[GovernorDimension, _T],
    ):
        self.datasetType = datasetType
        self.dimensions = dimensions

    @classmethod
    def makeTableSpecs(
        cls,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[ddl.TableSpec]:
        """Create specifications for all summary tables.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `ddl.TableSpec` ]
            Structure containing table specifications.
        """
        # Spec for collection_summary_dataset_type.
        datasetTypeTableSpec = ddl.TableSpec(fields=[])
        collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE")
        datasetTypeTableSpec.fields.add(
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True)
        )
        datasetTypeTableSpec.foreignKeys.append(
            ddl.ForeignKeySpec(
                "dataset_type", source=("dataset_type_id",), target=("id",), onDelete="CASCADE"
            )
        )
        # Specs for collection_summary_<dimension>.
        dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]()
        for dimension in dimensions.universe.getGovernorDimensions():
            tableSpec = ddl.TableSpec(fields=[])
            collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE")
            addDimensionForeignKey(tableSpec, dimension, primaryKey=True)
            dimensionTableSpecs[dimension] = tableSpec
        return CollectionSummaryTables(
            datasetType=datasetTypeTableSpec,
            dimensions=dimensionTableSpecs.freeze(),
        )
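

# An illustrative sketch of the schema makeTableSpecs describes, assuming the
# default integer collection key (called "collection_id" here) and
# "instrument" as an example governor dimension; names other than
# dataset_type_id are assumptions, not authoritative:
#
#   collection_summary_dataset_type (
#       collection_id    PK part, FK -> collection   ON DELETE CASCADE,
#       dataset_type_id  PK part, FK -> dataset_type ON DELETE CASCADE
#   )
#   collection_summary_instrument (
#       collection_id    PK part, FK -> collection ON DELETE CASCADE,
#       instrument       PK part, FK added by addDimensionForeignKey
#   )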
class CollectionSummaryManager:
    """Object that manages the summaries of which dataset types and governor
    dimension values are present in a collection.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    tables : `CollectionSummaryTables`
        Struct containing the tables that hold collection summaries.
    """

    def __init__(
        self,
        db: Database,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        tables: CollectionSummaryTables[sqlalchemy.schema.Table],
    ):
        self._db = db
        self._collections = collections
        self._collectionKeyName = collections.getCollectionForeignKeyName()
        self._dimensions = dimensions
        self._tables = tables
        # In-memory cache of summaries, keyed by collection primary key.
        self._cache: dict[Any, CollectionSummary] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryManager:
        """Create all summary tables (or check that they have been created),
        returning an object to manage them.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `CollectionSummaryManager`
            New manager object for collection summaries.
        """
        specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
        tables = CollectionSummaryTables(
            datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType),
            dimensions=NamedKeyDict[GovernorDimension, sqlalchemy.schema.Table](
                {
                    dimension: context.addTable(f"collection_summary_{dimension.name}", spec)
                    for dimension, spec in specs.dimensions.items()
                }
            ).freeze(),
        )
        return cls(
            db=db,
            collections=collections,
            dimensions=dimensions,
            tables=tables,
        )

    def update(
        self,
        collection: CollectionRecord,
        dataset_type_ids: Iterable[int],
        summary: CollectionSummary,
    ) -> None:
        """Update the summary tables to associate the given collection with
        the given dataset types and governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        dataset_type_ids : `~collections.abc.Iterable` [ `int` ]
            Integer IDs for the dataset types to associate with this
            collection.
        summary : `CollectionSummary`
            Summary to store. Dataset types must correspond to
            ``dataset_type_ids``.

        Notes
        -----
        This method should only be called inside the transaction context of
        another operation that inserts or associates datasets.
        """
        self._db.ensure(
            self._tables.datasetType,
            *[
                {
                    "dataset_type_id": dataset_type_id,
                    self._collectionKeyName: collection.key,
                }
                for dataset_type_id in dataset_type_ids
            ],
        )
        for dimension, values in summary.governors.items():
            if values:
                self._db.ensure(
                    self._tables.dimensions[dimension],
                    *[{self._collectionKeyName: collection.key, dimension: v} for v in values],
                )
        # Update the in-memory cache, too. These changes will remain even if
        # the database inserts above are rolled back by some later exception in
        # the same transaction, but that's okay: we never promise that a
        # CollectionSummary has _just_ the dataset types and governor dimension
        # values that are actually present, only that it is guaranteed to
        # contain any dataset types or governor dimension values that _may_ be
        # present.
        # That guarantee (and the possibility of rollbacks) means we can't get
        # away with checking the cache before we try the database inserts,
        # however; if someone had attempted to insert datasets of some dataset
        # type previously, and that rolled back, and we're now trying to insert
        # some more datasets of that same type, it would not be okay to skip
        # the DB summary table insertions because we found entries in the
        # in-memory cache.
        self.get(collection).update(summary)

    def refresh(self, dataset_types: Mapping[int, DatasetType]) -> None:
        """Load all collection summary information from the database.

        Parameters
        ----------
        dataset_types : `~collections.abc.Mapping` [`int`, `DatasetType`]
            Mapping of an `int` dataset_type_id value to `DatasetType`
            instance. Summaries are only loaded for dataset types that appear
            in this mapping.
        """
        # Set up the SQL query we'll use to fetch all of the summary
        # information at once.
        columns = [
            self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName),
            self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id"),
        ]
        fromClause: sqlalchemy.sql.expression.FromClause = self._tables.datasetType
        for dimension, table in self._tables.dimensions.items():
            columns.append(table.columns[dimension.name].label(dimension.name))
            fromClause = fromClause.join(
                table,
                onclause=(
                    self._tables.datasetType.columns[self._collectionKeyName]
                    == table.columns[self._collectionKeyName]
                ),
                isouter=True,
            )
        sql = sqlalchemy.sql.select(*columns).select_from(fromClause)
        # Run the query and construct CollectionSummary objects from the result
        # rows. This will never include CHAINED collections or collections
        # with no datasets.
        summaries: dict[Any, CollectionSummary] = {}
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        for row in sql_rows:
            # Collection key should never be None/NULL; it's what we join on.
            collectionKey = row[self._collectionKeyName]
            # dataset_type_id should also never be None/NULL; it's in the first
            # table we joined.
            if datasetType := dataset_types.get(row["dataset_type_id"]):
                # See if we have a summary already for this collection; if not,
                # make one.
                summary = summaries.get(collectionKey)
                if summary is None:
                    summary = CollectionSummary()
                    summaries[collectionKey] = summary
                # Update the dimensions with the values in this row that
                # aren't None/NULL (many will be None in general, because these
                # enter the query via LEFT OUTER JOIN).
                summary.dataset_types.add(datasetType)
                for dimension in self._tables.dimensions:
                    value = row[dimension.name]
                    if value is not None:
                        summary.governors.setdefault(dimension.name, set()).add(value)
        self._cache = summaries

    def get(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        summary = self._cache.get(collection.key)
        if summary is None:
            # When we load the summary information from the database, we don't
            # create summaries for CHAINED collections; those are created here
            # as needed, and *never* cached - we have no good way to update
            # those summaries when a new dataset is added to a child
            # collection.
            if collection.type is CollectionType.CHAINED:
                assert isinstance(collection, ChainedCollectionRecord)
                child_summaries = [self.get(self._collections.find(child)) for child in collection.children]
                if child_summaries:
                    summary = CollectionSummary.union(*child_summaries)
                else:
                    summary = CollectionSummary()
            else:
                # Either this collection doesn't have any datasets yet, or the
                # only datasets it has were created by some other process since
                # the last call to refresh. We assume the former; the user is
                # responsible for calling refresh if they want to read
                # concurrently-written things. We do remember this in the
                # cache.
                summary = CollectionSummary()
                self._cache[collection.key] = summary
        return summary
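

# Typical lifecycle, as an illustrative sketch (the surrounding objects and
# the dataset-type mapping are assumptions, not defined in this module):
#
#   manager = CollectionSummaryManager.initialize(
#       db, context, collections=collections, dimensions=dimensions
#   )
#   manager.refresh(dataset_types)   # load all summaries from the database
#   summary = manager.get(record)    # cached; unions children for CHAINED
#   manager.update(record, dataset_type_ids, summary)  # in an insert transaction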