# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("CollectionSummaryManager",)

from typing import Any, Callable, Dict, Generic, TypeVar

import sqlalchemy
from lsst.daf.butler import (
    DatasetType,
    GovernorDimension,
    NamedKeyDict,
    NamedKeyMapping,
    NamedValueSet,
    addDimensionForeignKey,
    ddl,
)
from lsst.daf.butler.registry.interfaces import (
    ChainedCollectionRecord,
    CollectionManager,
    CollectionRecord,
    Database,
    DimensionRecordStorageManager,
    StaticTablesContext,
)

from ..._collectionType import CollectionType
from ...summaries import CollectionSummary, GovernorDimensionRestriction

_T = TypeVar("_T")


class CollectionSummaryTables(Generic[_T]):
    """Structure that holds the table or table specification objects that
    summarize the contents of collections.

    Parameters
    ----------
    datasetType
        Table [specification] that summarizes which dataset types are in each
        collection.
    dimensions
        Mapping of table [specifications] that summarize which governor
        dimension values are present in the data IDs of each collection.
    """

    def __init__(
        self,
        datasetType: _T,
        dimensions: NamedKeyMapping[GovernorDimension, _T],
    ):
        self.datasetType = datasetType
        self.dimensions = dimensions

    @classmethod
    def makeTableSpecs(
        cls,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[ddl.TableSpec]:
        """Create specifications for all summary tables.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `ddl.TableSpec` ]
            Structure containing table specifications.
        """
        # Spec for collection_summary_dataset_type.
        datasetTypeTableSpec = ddl.TableSpec(fields=[])
        collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE")
        datasetTypeTableSpec.fields.add(
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True)
        )
        datasetTypeTableSpec.foreignKeys.append(
            ddl.ForeignKeySpec(
                "dataset_type", source=("dataset_type_id",), target=("id",), onDelete="CASCADE"
            )
        )
        # Specs for collection_summary_<dimension>.
        dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]()
        for dimension in dimensions.universe.getGovernorDimensions():
            tableSpec = ddl.TableSpec(fields=[])
            collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE")
            addDimensionForeignKey(tableSpec, dimension, primaryKey=True)
            dimensionTableSpecs[dimension] = tableSpec
        return CollectionSummaryTables(
            datasetType=datasetTypeTableSpec,
            dimensions=dimensionTableSpecs.freeze(),
        )
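

# For reference, the specs built by makeTableSpecs correspond roughly to DDL
# like the following (a sketch only; the collection key column comes from the
# CollectionManager in use, and "instrument" stands in for whatever governor
# dimensions the configured universe defines):
#
#     CREATE TABLE collection_summary_dataset_type (
#         collection_id    ...    REFERENCES collection ... ON DELETE CASCADE,
#         dataset_type_id  BIGINT REFERENCES dataset_type (id) ON DELETE CASCADE,
#         PRIMARY KEY (collection_id, dataset_type_id)
#     );
#
#     CREATE TABLE collection_summary_instrument (
#         collection_id    ...    REFERENCES collection ... ON DELETE CASCADE,
#         instrument       ...    REFERENCES instrument ...,
#         PRIMARY KEY (collection_id, instrument)
#     );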


class CollectionSummaryManager:
    """Object that manages the summaries of which dataset types and governor
    dimension values are present in a collection.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    tables : `CollectionSummaryTables`
        Struct containing the tables that hold collection summaries.
    """

    def __init__(
        self,
        db: Database,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        tables: CollectionSummaryTables[sqlalchemy.sql.Table],
    ):
        self._db = db
        self._collections = collections
        self._collectionKeyName = collections.getCollectionForeignKeyName()
        self._dimensions = dimensions
        self._tables = tables
        self._cache: Dict[Any, CollectionSummary] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryManager:
        """Create all summary tables (or check that they have been created),
        returning an object to manage them.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `CollectionSummaryManager`
            New manager object for collection summaries.
        """
        specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
        tables = CollectionSummaryTables(
            datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType),
            dimensions=NamedKeyDict(
                {
                    dimension: context.addTable(f"collection_summary_{dimension.name}", spec)
                    for dimension, spec in specs.dimensions.items()
                }
            ).freeze(),
        )
        return cls(
            db=db,
            collections=collections,
            dimensions=dimensions,
            tables=tables,
        )
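
    # A minimal construction sketch (assumptions: the caller already holds
    # the collection and dimension managers, and drives this through
    # `Database.declareStaticTables`, the usual entry point for declaring
    # static schema):
    #
    #     with db.declareStaticTables(create=True) as context:
    #         manager = CollectionSummaryManager.initialize(
    #             db,
    #             context,
    #             collections=collections,
    #             dimensions=dimensions,
    #         )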

    def update(
        self,
        collection: CollectionRecord,
        datasetType: DatasetType,
        dataset_type_id: int,
        governors: GovernorDimensionRestriction,
    ) -> None:
        """Update the summary tables to associate the given collection with
        a dataset type and governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        datasetType : `DatasetType`
            DatasetType instance to associate with this collection.
        dataset_type_id : `int`
            Integer ID for the dataset type to associate with this collection.
        governors : `GovernorDimensionRestriction`
            Mapping from `GovernorDimension` to the sets of values those
            dimensions may take in the data IDs of the datasets in this
            collection.

        Notes
        -----
        This method should only be called inside the transaction context of
        another operation that inserts or associates datasets.
        """
        self._db.ensure(
            self._tables.datasetType,
            {
                "dataset_type_id": dataset_type_id,
                self._collectionKeyName: collection.key,
            },
        )
        for dimension, values in governors.items():
            if values:
                self._db.ensure(
                    self._tables.dimensions[dimension.name],
                    *[{self._collectionKeyName: collection.key, dimension.name: v} for v in values],
                )
        # Update the in-memory cache, too. These changes will remain even if
        # the database inserts above are rolled back by some later exception
        # in the same transaction, but that's okay: we never promise that a
        # CollectionSummary has _just_ the dataset types and governor
        # dimension values that are actually present, only that it is
        # guaranteed to contain any dataset types or governor dimension
        # values that _may_ be present.
        # That guarantee (and the possibility of rollbacks) means we can't
        # get away with checking the cache before we try the database
        # inserts, however; if someone had attempted to insert datasets of
        # some dataset type previously, and that rolled back, and we're now
        # trying to insert some more datasets of that same type, it would
        # not be okay to skip the DB summary table insertions because we
        # found entries in the in-memory cache.
        summary = self.get(collection)
        summary.datasetTypes.add(datasetType)
        summary.dimensions.update(governors)
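
    # A minimal call sketch (hypothetical variable names; per the Notes
    # above, this runs inside the transaction opened by the dataset
    # insertion or association it accompanies):
    #
    #     with db.transaction():
    #         ...  # insert or associate the datasets themselves
    #         manager.update(run_record, datasetType, dataset_type_id, governors)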

    def refresh(self, get_dataset_type: Callable[[int], DatasetType]) -> None:
        """Load all collection summary information from the database.

        Parameters
        ----------
        get_dataset_type : `Callable`
            Function that takes an `int` dataset_type_id value and returns a
            `DatasetType` instance.
        """
        # Set up the SQL query we'll use to fetch all of the summary
        # information at once.
        columns = [
            self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName),
            self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id"),
        ]
        fromClause = self._tables.datasetType
        for dimension, table in self._tables.dimensions.items():
            columns.append(table.columns[dimension.name].label(dimension.name))
            fromClause = fromClause.join(
                table,
                onclause=(
                    self._tables.datasetType.columns[self._collectionKeyName]
                    == table.columns[self._collectionKeyName]
                ),
                isouter=True,
            )
        sql = sqlalchemy.sql.select(*columns).select_from(fromClause)
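
        # The statement built above takes roughly this shape (a sketch; the
        # collection key column name comes from the CollectionManager, and
        # "instrument" and "skymap" stand in for whatever governor dimensions
        # the universe defines):
        #
        #     SELECT
        #         collection_summary_dataset_type.collection_id,
        #         collection_summary_dataset_type.dataset_type_id,
        #         collection_summary_instrument.instrument,
        #         collection_summary_skymap.skymap
        #     FROM collection_summary_dataset_type
        #     LEFT OUTER JOIN collection_summary_instrument
        #         ON collection_summary_dataset_type.collection_id
        #             = collection_summary_instrument.collection_id
        #     LEFT OUTER JOIN collection_summary_skymap
        #         ON collection_summary_dataset_type.collection_id
        #             = collection_summary_skymap.collection_id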
        # Run the query and construct CollectionSummary objects from the
        # result rows. This will never include CHAINED collections or
        # collections with no datasets.
        summaries: Dict[Any, CollectionSummary] = {}
        for row in self._db.query(sql).mappings():
            # Collection key should never be None/NULL; it's what we join on.
            # Extract that and then turn it into a collection name.
            collectionKey = row[self._collectionKeyName]
            # dataset_type_id should also never be None/NULL; it's in the
            # first table we joined.
            datasetType = get_dataset_type(row["dataset_type_id"])
            # See if we have a summary already for this collection; if not,
            # make one.
            summary = summaries.get(collectionKey)
            if summary is None:
                summary = CollectionSummary(
                    datasetTypes=NamedValueSet([datasetType]),
                    dimensions=GovernorDimensionRestriction.makeEmpty(self._dimensions.universe),
                )
                summaries[collectionKey] = summary
            else:
                summary.datasetTypes.add(datasetType)
            # Update the dimensions with the values in this row that aren't
            # None/NULL (many will be None in general, because these enter
            # the query via LEFT OUTER JOIN).
            for dimension in self._tables.dimensions:
                value = row[dimension.name]
                if value is not None:
                    summary.dimensions.add(dimension, value)
        self._cache = summaries

    def get(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        summary = self._cache.get(collection.key)
        if summary is None:
            # When we load the summary information from the database, we
            # don't create summaries for CHAINED collections; those are
            # created here as needed, and *never* cached - we have no good
            # way to update those summaries when a new dataset is added to
            # a child collection.
            if collection.type is CollectionType.CHAINED:
                assert isinstance(collection, ChainedCollectionRecord)
                child_summaries = [self.get(self._collections.find(child)) for child in collection.children]
                if child_summaries:
                    summary = CollectionSummary.union(*child_summaries)
                else:
                    summary = CollectionSummary.makeEmpty(self._dimensions.universe)
            else:
                # Either this collection doesn't have any datasets yet, or
                # the only datasets it has were created by some other
                # process since the last call to refresh. We assume the
                # former; the user is responsible for calling refresh if
                # they want to read concurrently-written things. We do
                # remember this in the cache.
                summary = CollectionSummary.makeEmpty(self._dimensions.universe)
                self._cache[collection.key] = summary
        return summary
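

# A minimal read-side sketch (hypothetical names; in practice the manager is
# owned by a Registry rather than used directly, and the collection name is
# just an example):
#
#     record = collections.find("HSC/defaults")
#     summary = manager.get(record)
#     if datasetType in summary.datasetTypes:
#         ...  # the collection *may* contain datasets of this type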