Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 97%
78 statements
coverage.py v6.4.4, created at 2022-09-22 02:04 -0700
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("CollectionSummaryManager",)

from typing import Any, Callable, Dict, Generic, Iterable, TypeVar

import sqlalchemy
from lsst.daf.butler import (
    DatasetType,
    GovernorDimension,
    NamedKeyDict,
    NamedKeyMapping,
    addDimensionForeignKey,
    ddl,
)
from lsst.daf.butler.registry.interfaces import (
    ChainedCollectionRecord,
    CollectionManager,
    CollectionRecord,
    Database,
    DimensionRecordStorageManager,
    StaticTablesContext,
)

from ..._collection_summary import CollectionSummary
from ..._collectionType import CollectionType

_T = TypeVar("_T")
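
# `_T` is instantiated as `ddl.TableSpec` while table specifications are
# being built (`CollectionSummaryTables.makeTableSpecs`) and as
# `sqlalchemy.sql.Table` once those specifications have been declared
# (`CollectionSummaryManager.initialize`).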


class CollectionSummaryTables(Generic[_T]):
    """Structure that holds the table or table specification objects that
    summarize the contents of collections.

    Parameters
    ----------
    datasetType : _T
        Table [specification] that summarizes which dataset types are in each
        collection.
    dimensions : `NamedKeyMapping` [ `GovernorDimension`, _T ]
        Mapping of table [specifications] that summarize which governor
        dimension values are present in the data IDs of each collection.
    """

    def __init__(
        self,
        datasetType: _T,
        dimensions: NamedKeyMapping[GovernorDimension, _T],
    ):
        self.datasetType = datasetType
        self.dimensions = dimensions

    @classmethod
    def makeTableSpecs(
        cls,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[ddl.TableSpec]:
        """Create specifications for all summary tables.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `ddl.TableSpec` ]
            Structure containing table specifications.
        """
        # Spec for collection_summary_dataset_type.
        datasetTypeTableSpec = ddl.TableSpec(fields=[])
        collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE")
        datasetTypeTableSpec.fields.add(
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True)
        )
        datasetTypeTableSpec.foreignKeys.append(
            ddl.ForeignKeySpec(
                "dataset_type", source=("dataset_type_id",), target=("id",), onDelete="CASCADE"
            )
        )
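        # Schematically (a sketch, not the exact DDL; the ``collection_id``
        # column name below is hypothetical and actually comes from the
        # `CollectionManager`), the spec built above corresponds to:
        #
        #     CREATE TABLE collection_summary_dataset_type (
        #         collection_id ... REFERENCES ... ON DELETE CASCADE,
        #         dataset_type_id BIGINT
        #             REFERENCES dataset_type (id) ON DELETE CASCADE,
        #         PRIMARY KEY (collection_id, dataset_type_id)
        #     )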
        # Specs for collection_summary_<dimension>.
        dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]()
        for dimension in dimensions.universe.getGovernorDimensions():
            tableSpec = ddl.TableSpec(fields=[])
            collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE")
            addDimensionForeignKey(tableSpec, dimension, primaryKey=True)
            dimensionTableSpecs[dimension] = tableSpec
        return CollectionSummaryTables(
            datasetType=datasetTypeTableSpec,
            dimensions=dimensionTableSpecs.freeze(),
        )


class CollectionSummaryManager:
    """Object that manages summaries of the dataset types and governor
    dimension values present in each collection.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    tables : `CollectionSummaryTables`
        Struct containing the tables that hold collection summaries.
    """

    def __init__(
        self,
        db: Database,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        tables: CollectionSummaryTables[sqlalchemy.sql.Table],
    ):
        self._db = db
        self._collections = collections
        self._collectionKeyName = collections.getCollectionForeignKeyName()
        self._dimensions = dimensions
        self._tables = tables
        self._cache: Dict[Any, CollectionSummary] = {}
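        # The cache above maps collection primary-key values to summaries; it
        # is populated in bulk by `refresh` and incrementally by `update` and
        # `get`.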

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryManager:
        """Create all summary tables (or check that they have been created),
        returning an object to manage them.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `CollectionSummaryManager`
            New manager object for collection summaries.
        """
        specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
        tables = CollectionSummaryTables(
            datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType),
            dimensions=NamedKeyDict(
                {
                    dimension: context.addTable(f"collection_summary_{dimension.name}", spec)
                    for dimension, spec in specs.dimensions.items()
                }
            ).freeze(),
        )
        return cls(
            db=db,
            collections=collections,
            dimensions=dimensions,
            tables=tables,
        )

    def update(
        self,
        collection: CollectionRecord,
        dataset_type_ids: Iterable[int],
        summary: CollectionSummary,
    ) -> None:
        """Update the summary tables to associate the given collection with
        the given dataset types and governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        dataset_type_ids : `Iterable` [ `int` ]
            Integer IDs for the dataset types to associate with this
            collection.
        summary : `CollectionSummary`
            Summary to store.  Dataset types must correspond to
            ``dataset_type_ids``.

        Notes
        -----
        This method should only be called inside the transaction context of
        another operation that inserts or associates datasets.
        """
        self._db.ensure(
            self._tables.datasetType,
            *[
                {
                    "dataset_type_id": dataset_type_id,
                    self._collectionKeyName: collection.key,
                }
                for dataset_type_id in dataset_type_ids
            ],
        )
        for dimension, values in summary.governors.items():
            if values:  # coverage: condition was never false in tests (partial branch)
                self._db.ensure(
                    self._tables.dimensions[dimension],
                    *[{self._collectionKeyName: collection.key, dimension: v} for v in values],
                )
        # Update the in-memory cache, too.  These changes will remain even if
        # the database inserts above are rolled back by some later exception
        # in the same transaction, but that's okay: we never promise that a
        # CollectionSummary has _just_ the dataset types and governor
        # dimension values that are actually present, only that it is
        # guaranteed to contain any dataset types or governor dimension
        # values that _may_ be present.
        # That guarantee (and the possibility of rollbacks) means we can't
        # get away with checking the cache before we try the database
        # inserts, however; if someone had attempted to insert datasets of
        # some dataset type previously, and that rolled back, and we're now
        # trying to insert some more datasets of that same type, it would not
        # be okay to skip the DB summary table insertions because we found
        # entries in the in-memory cache.
        self.get(collection).update(summary)

    def refresh(self, get_dataset_type: Callable[[int], DatasetType]) -> None:
        """Load all collection summary information from the database.

        Parameters
        ----------
        get_dataset_type : `Callable`
            Function that takes an `int` dataset_type_id value and returns a
            `DatasetType` instance.
        """
        # Set up the SQL query we'll use to fetch all of the summary
        # information at once.
        columns = [
            self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName),
            self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id"),
        ]
        fromClause = self._tables.datasetType
        for dimension, table in self._tables.dimensions.items():
            columns.append(table.columns[dimension.name].label(dimension.name))
            fromClause = fromClause.join(
                table,
                onclause=(
                    self._tables.datasetType.columns[self._collectionKeyName]
                    == table.columns[self._collectionKeyName]
                ),
                isouter=True,
            )
        sql = sqlalchemy.sql.select(*columns).select_from(fromClause)
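        # In a data repository whose governor dimensions are ``instrument``
        # and ``skymap``, the query built above is roughly (a sketch; the
        # real collection key column name comes from the collection manager):
        #
        #     SELECT dst.collection_id, dst.dataset_type_id,
        #            csi.instrument, css.skymap
        #     FROM collection_summary_dataset_type AS dst
        #         LEFT OUTER JOIN collection_summary_instrument AS csi
        #             ON dst.collection_id = csi.collection_id
        #         LEFT OUTER JOIN collection_summary_skymap AS css
        #             ON dst.collection_id = css.collection_id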
        # Run the query and construct CollectionSummary objects from the
        # result rows.  This will never include CHAINED collections or
        # collections with no datasets.
        summaries: Dict[Any, CollectionSummary] = {}
        for row in self._db.query(sql).mappings():
            # Collection key should never be None/NULL; it's what we join on.
            # Extract it to use as the dictionary key below.
            collectionKey = row[self._collectionKeyName]
            # dataset_type_id should also never be None/NULL; it's in the
            # first table we joined.
            datasetType = get_dataset_type(row["dataset_type_id"])
            # See if we have a summary already for this collection; if not,
            # make one.
            summary = summaries.get(collectionKey)
            if summary is None:
                summary = CollectionSummary()
                summaries[collectionKey] = summary
            # Update the dimensions with the values in this row that aren't
            # None/NULL (many will be None in general, because these enter
            # the query via LEFT OUTER JOIN).
            summary.dataset_types.add(datasetType)
            for dimension in self._tables.dimensions:
                value = row[dimension.name]
                if value is not None:
                    summary.governors.setdefault(dimension.name, set()).add(value)
        self._cache = summaries

    def get(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        summary = self._cache.get(collection.key)
        if summary is None:
            # When we load the summary information from the database, we
            # don't create summaries for CHAINED collections; those are
            # created here as needed, and *never* cached - we have no good
            # way to update those summaries when a new dataset is added to a
            # child collection.
            if collection.type is CollectionType.CHAINED:
                assert isinstance(collection, ChainedCollectionRecord)
                child_summaries = [self.get(self._collections.find(child)) for child in collection.children]
                if child_summaries:  # coverage: condition was never false in tests (partial branch)
                    summary = CollectionSummary.union(*child_summaries)
                else:
                    summary = CollectionSummary()
            else:
                # Either this collection doesn't have any datasets yet, or
                # the only datasets it has were created by some other process
                # since the last call to refresh.  We assume the former; the
                # user is responsible for calling refresh if they want to
                # read concurrently-written things.  We do remember this in
                # the cache.
                summary = CollectionSummary()
                self._cache[collection.key] = summary
        return summary
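
# A typical lifecycle, as a hypothetical sketch (``db``, ``context``,
# ``collections``, ``dimensions``, ``get_dataset_type``, ``run_record``,
# ``ids``, and ``summary`` are all assumed to come from the surrounding
# `Registry` machinery):
#
#     manager = CollectionSummaryManager.initialize(
#         db, context, collections=collections, dimensions=dimensions
#     )
#     manager.refresh(get_dataset_type)         # load existing summaries
#     manager.update(run_record, ids, summary)  # inside an insert transaction
#     manager.get(run_record)                   # cached summary lookup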