Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 98%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = (
25 "CollectionSummaryManager",
26)
28from typing import (
29 Any,
30 Callable,
31 Dict,
32 Generic,
33 TypeVar,
34)
36import sqlalchemy
38from lsst.daf.butler import (
39 DatasetType,
40 ddl,
41 GovernorDimension,
42 NamedKeyDict,
43 NamedKeyMapping,
44 NamedValueSet,
45)
46from lsst.daf.butler import addDimensionForeignKey
47from lsst.daf.butler.registry.interfaces import (
48 ChainedCollectionRecord,
49 CollectionManager,
50 CollectionRecord,
51 Database,
52 DimensionRecordStorageManager,
53 StaticTablesContext,
54)
55from ..._collectionType import CollectionType
56from ...summaries import CollectionSummary, GovernorDimensionRestriction
58_T = TypeVar("_T")
class CollectionSummaryTables(Generic[_T]):
    """Struct grouping the table (or table-specification) objects that
    record summaries of collection contents.

    Parameters
    ----------
    datasetType
        Table [specification] that summarizes which dataset types are in each
        collection.
    dimensions
        Mapping of table [specifications] that summarize which governor
        dimension values are present in the data IDs of each collection.
    """
    def __init__(
        self,
        datasetType: _T,
        dimensions: NamedKeyMapping[GovernorDimension, _T],
    ):
        self.dimensions = dimensions
        self.datasetType = datasetType

    @classmethod
    def makeTableSpecs(
        cls,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[ddl.TableSpec]:
        """Create specifications for all summary tables.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `ddl.TableSpec` ]
            Structure containing table specifications.
        """
        # collection_summary_dataset_type: one row per (collection,
        # dataset type) pair; both columns form the primary key and
        # cascade-delete with their referents.
        typeSpec = ddl.TableSpec(fields=[])
        collections.addCollectionForeignKey(typeSpec, primaryKey=True, onDelete="CASCADE")
        typeSpec.fields.add(
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True)
        )
        typeSpec.foreignKeys.append(
            ddl.ForeignKeySpec("dataset_type", source=("dataset_type_id",), target=("id",),
                               onDelete="CASCADE")
        )
        # One collection_summary_<dimension> spec for each governor
        # dimension in the universe: (collection, dimension value) pairs.
        governorSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]()
        for governor in dimensions.universe.getGovernorDimensions():
            spec = ddl.TableSpec(fields=[])
            collections.addCollectionForeignKey(spec, primaryKey=True, onDelete="CASCADE")
            addDimensionForeignKey(spec, governor, primaryKey=True)
            governorSpecs[governor] = spec
        return CollectionSummaryTables(
            datasetType=typeSpec,
            dimensions=governorSpecs.freeze(),
        )
class CollectionSummaryManager:
    """Object that manages the summaries of which dataset types and governor
    dimension values are present in a collection.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    tables : `CollectionSummaryTables`
        Struct containing the tables that hold collection summaries.
    """
    def __init__(
        self,
        db: Database, *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        tables: CollectionSummaryTables[sqlalchemy.sql.Table],
    ):
        self._db = db
        self._collections = collections
        # Name of the foreign-key column that links summary rows to their
        # collection; also used as the key in the row dicts we insert/read.
        self._collectionKeyName = collections.getCollectionForeignKeyName()
        self._dimensions = dimensions
        self._tables = tables
        # In-memory cache of summaries, keyed by collection primary key.
        self._cache: Dict[Any, CollectionSummary] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext, *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryManager:
        """Create all summary tables (or check that they have been created),
        returning an object to manage them.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `CollectionSummaryManager`
            New manager object for collection summaries.
        """
        specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
        tables = CollectionSummaryTables(
            datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType),
            dimensions=NamedKeyDict({
                dimension: context.addTable(f"collection_summary_{dimension.name}", spec)
                for dimension, spec in specs.dimensions.items()
            }).freeze(),
        )
        return cls(
            db=db,
            collections=collections,
            dimensions=dimensions,
            tables=tables,
        )

    def update(
        self,
        collection: CollectionRecord,
        datasetType: DatasetType,
        dataset_type_id: int,
        governors: GovernorDimensionRestriction,
    ) -> None:
        """Update the summary tables to associate the given collection with
        a dataset type and governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        datasetType : `DatasetType`
            DatasetType instance to associate with this collection.
        dataset_type_id : `int`
            Integer ID for the dataset type to associate with this collection.
        governors : `GovernorDimensionRestriction`
            Mapping from `GovernorDimension` to the sets of values they
            may be associated with in the data IDs of the datasets in this
            collection.

        Notes
        -----
        This method should only be called inside the transaction context of
        another operation that inserts or associates datasets.
        """
        # ensure() is an idempotent insert: re-associating the same dataset
        # type or governor value with a collection is a no-op.
        self._db.ensure(
            self._tables.datasetType,
            {
                "dataset_type_id": dataset_type_id,
                self._collectionKeyName: collection.key,
            }
        )
        for dimension, values in governors.items():
            if values:
                self._db.ensure(
                    self._tables.dimensions[dimension.name],
                    *[{
                        self._collectionKeyName: collection.key,
                        dimension.name: v
                    } for v in values],
                )
        # Update the in-memory cache, too.  These changes will remain even if
        # the database inserts above are rolled back by some later exception in
        # the same transaction, but that's okay: we never promise that a
        # CollectionSummary has _just_ the dataset types and governor dimension
        # values that are actually present, only that it is guaranteed to
        # contain any dataset types or governor dimension values that _may_ be
        # present.
        # That guarantee (and the possibility of rollbacks) means we can't get
        # away with checking the cache before we try the database inserts,
        # however; if someone had attempted to insert datasets of some dataset
        # type previously, and that rolled back, and we're now trying to insert
        # some more datasets of that same type, it would not be okay to skip
        # the DB summary table insertions because we found entries in the
        # in-memory cache.
        summary = self.get(collection)
        summary.datasetTypes.add(datasetType)
        summary.dimensions.update(governors)

    def refresh(self, get_dataset_type: Callable[[int], DatasetType]) -> None:
        """Load all collection summary information from the database.

        Parameters
        ----------
        get_dataset_type : `Callable`
            Function that takes an `int` dataset_type_id value and returns a
            `DatasetType` instance.
        """
        # Set up the SQL query we'll use to fetch all of the summary
        # information at once: the dataset-type table LEFT OUTER JOINed to
        # every governor-dimension table on the collection key.
        columns = [
            self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName),
            self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id"),
        ]
        fromClause = self._tables.datasetType
        for dimension, table in self._tables.dimensions.items():
            columns.append(table.columns[dimension.name].label(dimension.name))
            fromClause = fromClause.join(
                table,
                onclause=(
                    self._tables.datasetType.columns[self._collectionKeyName]
                    == table.columns[self._collectionKeyName]
                ),
                isouter=True,
            )
        sql = sqlalchemy.sql.select(*columns).select_from(fromClause)
        # Run the query and construct CollectionSummary objects from the result
        # rows.  This will never include CHAINED collections or collections
        # with no datasets.
        summaries: Dict[Any, CollectionSummary] = {}
        for row in self._db.query(sql).mappings():
            # Collection key should never be None/NULL; it's what we join on.
            # Extract that and then turn it into a collection name.
            collectionKey = row[self._collectionKeyName]
            # dataset_type_id should also never be None/NULL; it's in the first
            # table we joined.
            datasetType = get_dataset_type(row["dataset_type_id"])
            # See if we have a summary already for this collection; if not,
            # make one.
            summary = summaries.get(collectionKey)
            if summary is None:
                summary = CollectionSummary(
                    datasetTypes=NamedValueSet([datasetType]),
                    dimensions=GovernorDimensionRestriction.makeEmpty(self._dimensions.universe),
                )
                summaries[collectionKey] = summary
            else:
                summary.datasetTypes.add(datasetType)
            # Update the dimensions with the values in this row that aren't
            # None/NULL (many will be in general, because these enter the query
            # via LEFT OUTER JOIN).
            for dimension in self._tables.dimensions:
                value = row[dimension.name]
                if value is not None:
                    summary.dimensions.add(dimension, value)
        # Replace the cache wholesale; refresh is the point of
        # synchronization with concurrent writers.
        self._cache = summaries

    def get(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        summary = self._cache.get(collection.key)
        if summary is None:
            # When we load the summary information from the database, we don't
            # create summaries for CHAINED collections; those are created here
            # as needed, and *never* cached - we have no good way to update
            # those summaries when a new dataset is added to a child
            # collection.
            if collection.type is CollectionType.CHAINED:
                assert isinstance(collection, ChainedCollectionRecord)
                child_summaries = [
                    self.get(self._collections.find(child))
                    for child in collection.children
                ]
                if child_summaries:
                    summary = CollectionSummary.union(*child_summaries)
                else:
                    summary = CollectionSummary.makeEmpty(self._dimensions.universe)
            else:
                # Either this collection doesn't have any datasets yet, or the
                # only datasets it has were created by some other process since
                # the last call to refresh.  We assume the former; the user is
                # responsible for calling refresh if they want to read
                # concurrently-written things.  We do remember this in the
                # cache.
                summary = CollectionSummary.makeEmpty(self._dimensions.universe)
                self._cache[collection.key] = summary
        return summary