Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 97%
81 statements
coverage.py v7.3.1, created at 2023-10-02 07:59 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("CollectionSummaryManager",)

from collections.abc import Iterable, Mapping
from typing import Any, Generic, TypeVar

import sqlalchemy

from ....core import (
    DatasetType,
    GovernorDimension,
    NamedKeyDict,
    NamedKeyMapping,
    addDimensionForeignKey,
    ddl,
)
from ..._collection_summary import CollectionSummary
from ..._collectionType import CollectionType
from ...interfaces import (
    ChainedCollectionRecord,
    CollectionManager,
    CollectionRecord,
    Database,
    DimensionRecordStorageManager,
    StaticTablesContext,
)
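
# Type variable for the table objects held by CollectionSummaryTables:
# ddl.TableSpec for specifications, or sqlalchemy.schema.Table once declared.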
_T = TypeVar("_T")


class CollectionSummaryTables(Generic[_T]):
    """Structure that holds the table or table specification objects that
    summarize the contents of collections.

    Parameters
    ----------
    datasetType : `_T`
        Table [specification] that summarizes which dataset types are in each
        collection.
    dimensions : `NamedKeyMapping` [ `GovernorDimension`, `_T` ]
        Mapping of table [specifications] that summarize which governor
        dimension values are present in the data IDs of each collection.
    """

    def __init__(
        self,
        datasetType: _T,
        dimensions: NamedKeyMapping[GovernorDimension, _T],
    ):
        self.datasetType = datasetType
        self.dimensions = dimensions

    @classmethod
    def makeTableSpecs(
        cls,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[ddl.TableSpec]:
        """Create specifications for all summary tables.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `ddl.TableSpec` ]
            Structure containing table specifications.
        """
        # Spec for collection_summary_dataset_type.
        datasetTypeTableSpec = ddl.TableSpec(fields=[])
        collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE")
        datasetTypeTableSpec.fields.add(
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True)
        )
        datasetTypeTableSpec.foreignKeys.append(
            ddl.ForeignKeySpec(
                "dataset_type", source=("dataset_type_id",), target=("id",), onDelete="CASCADE"
            )
        )
        # Specs for collection_summary_<dimension>.
        dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]()
        for dimension in dimensions.universe.getGovernorDimensions():
            tableSpec = ddl.TableSpec(fields=[])
            collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE")
            addDimensionForeignKey(tableSpec, dimension, primaryKey=True)
            dimensionTableSpecs[dimension] = tableSpec
        return CollectionSummaryTables(
            datasetType=datasetTypeTableSpec,
            dimensions=dimensionTableSpecs.freeze(),
        )


class CollectionSummaryManager:
    """Object that manages the summaries of which dataset types and governor
    dimension values are present in a collection.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    tables : `CollectionSummaryTables`
        Struct containing the tables that hold collection summaries.
    """

    def __init__(
        self,
        db: Database,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        tables: CollectionSummaryTables[sqlalchemy.schema.Table],
    ):
        self._db = db
        self._collections = collections
        self._collectionKeyName = collections.getCollectionForeignKeyName()
        self._dimensions = dimensions
        self._tables = tables
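        # In-memory cache of CollectionSummary objects, keyed by collection
        # primary key; populated by refresh() and extended lazily by get().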
        self._cache: dict[Any, CollectionSummary] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryManager:
        """Create all summary tables (or check that they have been created),
        returning an object to manage them.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `CollectionSummaryManager`
            New manager object for collection summaries.
        """
        specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
        tables = CollectionSummaryTables(
            datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType),
            dimensions=NamedKeyDict[GovernorDimension, sqlalchemy.schema.Table](
                {
                    dimension: context.addTable(f"collection_summary_{dimension.name}", spec)
                    for dimension, spec in specs.dimensions.items()
                }
            ).freeze(),
        )
        return cls(
            db=db,
            collections=collections,
            dimensions=dimensions,
            tables=tables,
        )

    def update(
        self,
        collection: CollectionRecord,
        dataset_type_ids: Iterable[int],
        summary: CollectionSummary,
    ) -> None:
        """Update the summary tables to associate the given collection with
        the given dataset types and governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        dataset_type_ids : `~collections.abc.Iterable` [ `int` ]
            Integer IDs for the dataset types to associate with this
            collection.
        summary : `CollectionSummary`
            Summary to store. Dataset types must correspond to
            ``dataset_type_ids``.

        Notes
        -----
        This method should only be called inside the transaction context of
        another operation that inserts or associates datasets.
        """
        self._db.ensure(
            self._tables.datasetType,
            *[
                {
                    "dataset_type_id": dataset_type_id,
                    self._collectionKeyName: collection.key,
                }
                for dataset_type_id in dataset_type_ids
            ],
        )
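        # Do the same for the governor dimension values present in the
        # summary.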
        for dimension, values in summary.governors.items():
            if values:
                self._db.ensure(
                    self._tables.dimensions[dimension],
                    *[{self._collectionKeyName: collection.key, dimension: v} for v in values],
                )
        # Update the in-memory cache, too. These changes will remain even if
        # the database inserts above are rolled back by some later exception in
        # the same transaction, but that's okay: we never promise that a
        # CollectionSummary has _just_ the dataset types and governor dimension
        # values that are actually present, only that it is guaranteed to
        # contain any dataset types or governor dimension values that _may_ be
        # present.
        # That guarantee (and the possibility of rollbacks) means we can't get
        # away with checking the cache before we try the database inserts,
        # however; if someone had attempted to insert datasets of some dataset
        # type previously, and that rolled back, and we're now trying to insert
        # some more datasets of that same type, it would not be okay to skip
        # the DB summary table insertions because we found entries in the
        # in-memory cache.
        self.get(collection).update(summary)

    def refresh(self, dataset_types: Mapping[int, DatasetType]) -> None:
        """Load all collection summary information from the database.

        Parameters
        ----------
        dataset_types : `~collections.abc.Mapping` [`int`, `DatasetType`]
            Mapping of an `int` dataset_type_id value to `DatasetType`
            instance. Summaries are only loaded for dataset types that appear
            in this mapping.
        """
        # Set up the SQL query we'll use to fetch all of the summary
        # information at once.
        columns = [
            self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName),
            self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id"),
        ]
        fromClause: sqlalchemy.sql.expression.FromClause = self._tables.datasetType
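        # LEFT OUTER JOIN each governor-dimension summary table on the
        # collection key, so a collection still yields rows even if it has no
        # entries for a particular dimension.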
        for dimension, table in self._tables.dimensions.items():
            columns.append(table.columns[dimension.name].label(dimension.name))
            fromClause = fromClause.join(
                table,
                onclause=(
                    self._tables.datasetType.columns[self._collectionKeyName]
                    == table.columns[self._collectionKeyName]
                ),
                isouter=True,
            )
        sql = sqlalchemy.sql.select(*columns).select_from(fromClause)
        # Run the query and construct CollectionSummary objects from the result
        # rows. This will never include CHAINED collections or collections
        # with no datasets.
        summaries: dict[Any, CollectionSummary] = {}
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        for row in sql_rows:
            # Collection key should never be None/NULL; it's what we join on.
            # Extract it to use as the key for the summaries dict.
            collectionKey = row[self._collectionKeyName]
            # dataset_type_id should also never be None/NULL; it's in the first
            # table we joined.
            if datasetType := dataset_types.get(row["dataset_type_id"]):
                # See if we have a summary already for this collection; if not,
                # make one.
                summary = summaries.get(collectionKey)
                if summary is None:
                    summary = CollectionSummary()
                    summaries[collectionKey] = summary
                summary.dataset_types.add(datasetType)
                # Update the governors with the values in this row that aren't
                # None/NULL (many will be NULL in general, because these enter
                # the query via LEFT OUTER JOIN).
                for dimension in self._tables.dimensions:
                    value = row[dimension.name]
                    if value is not None:
                        summary.governors.setdefault(dimension.name, set()).add(value)
        self._cache = summaries

    def get(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        summary = self._cache.get(collection.key)
        if summary is None:
            # When we load the summary information from the database, we don't
            # create summaries for CHAINED collections; those are created here
            # as needed, and *never* cached - we have no good way to update
            # those summaries when a new dataset is added to a child
            # collection.
            if collection.type is CollectionType.CHAINED:
                assert isinstance(collection, ChainedCollectionRecord)
                child_summaries = [self.get(self._collections.find(child)) for child in collection.children]
                if child_summaries:
                    summary = CollectionSummary.union(*child_summaries)
                else:
                    summary = CollectionSummary()
            else:
                # Either this collection doesn't have any datasets yet, or the
                # only datasets it has were created by some other process since
                # the last call to refresh. We assume the former; the user is
                # responsible for calling refresh if they want to read
                # concurrently-written things. We do remember this in the
                # cache.
                summary = CollectionSummary()
                self._cache[collection.key] = summary
        return summary