Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 91% (103 statements)
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

from .... import ddl

__all__ = ("CollectionSummaryManager",)

from collections.abc import Callable, Iterable, Mapping
from typing import Any, Generic, TypeVar

import sqlalchemy

from ...._dataset_type import DatasetType
from ...._named import NamedKeyDict, NamedKeyMapping
from ....dimensions import GovernorDimension, addDimensionForeignKey
from ..._caching_context import CachingContext
from ..._collection_summary import CollectionSummary
from ..._collection_type import CollectionType
from ...interfaces import (
    CollectionManager,
    CollectionRecord,
    Database,
    DimensionRecordStorageManager,
    StaticTablesContext,
)
from ...wildcards import CollectionWildcard

_T = TypeVar("_T")
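

# ``_T`` is the table payload type: ``ddl.TableSpec`` while the schema is
# being declared (`makeTableSpecs`), ``sqlalchemy.schema.Table`` once the
# tables actually exist (`CollectionSummaryManager.initialize`).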
class CollectionSummaryTables(Generic[_T]):
    """Structure that holds the table or table specification objects that
    summarize the contents of collections.

    Parameters
    ----------
    datasetType
        Table [specification] that summarizes which dataset types are in each
        collection.
    dimensions
        Mapping of table [specifications] that summarize which governor
        dimension values are present in the data IDs of each collection.
    """

    def __init__(
        self,
        datasetType: _T,
        dimensions: NamedKeyMapping[GovernorDimension, _T],
    ):
        self.datasetType = datasetType
        self.dimensions = dimensions

    @classmethod
    def makeTableSpecs(
        cls,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[ddl.TableSpec]:
        """Create specifications for all summary tables.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `ddl.TableSpec` ]
            Structure containing table specifications.
        """
        # Spec for collection_summary_dataset_type.
        datasetTypeTableSpec = ddl.TableSpec(fields=[])
        collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE")
        datasetTypeTableSpec.fields.add(
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True)
        )
        datasetTypeTableSpec.foreignKeys.append(
            ddl.ForeignKeySpec(
                "dataset_type", source=("dataset_type_id",), target=("id",), onDelete="CASCADE"
            )
        )
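        # Both foreign keys above use ON DELETE CASCADE, so summary rows are
        # removed automatically when their collection or dataset type is
        # deleted.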
        # Specs for collection_summary_<dimension>.
        dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]()
        for dimension in dimensions.universe.governor_dimensions:
            tableSpec = ddl.TableSpec(fields=[])
            collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE")
            addDimensionForeignKey(tableSpec, dimension, primaryKey=True)
            dimensionTableSpecs[dimension] = tableSpec
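        # Each summary table is nothing but a composite primary key: a row
        # links one collection to one dataset type or one governor value.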
        return CollectionSummaryTables(
            datasetType=datasetTypeTableSpec,
            dimensions=dimensionTableSpecs.freeze(),
        )


class CollectionSummaryManager:
    """Object that manages the summaries of which dataset types and governor
    dimension values are present in a collection.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    tables : `CollectionSummaryTables`
        Struct containing the tables that hold collection summaries.
    dataset_type_table : `sqlalchemy.schema.Table`
        Table containing dataset type definitions.
    caching_context : `CachingContext`
        Object controlling caching of information returned by managers.
    """

    def __init__(
        self,
        db: Database,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        tables: CollectionSummaryTables[sqlalchemy.schema.Table],
        dataset_type_table: sqlalchemy.schema.Table,
        caching_context: CachingContext,
    ):
        self._db = db
        self._collections = collections
        self._collectionKeyName = collections.getCollectionForeignKeyName()
        self._dimensions = dimensions
        self._tables = tables
        self._dataset_type_table = dataset_type_table
        self._caching_context = caching_context

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        dataset_type_table: sqlalchemy.schema.Table,
        caching_context: CachingContext,
    ) -> CollectionSummaryManager:
        """Create all summary tables (or check that they have been created),
        returning an object to manage them.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        dataset_type_table : `sqlalchemy.schema.Table`
            Table containing dataset type definitions.
        caching_context : `CachingContext`
            Object controlling caching of information returned by managers.

        Returns
        -------
        manager : `CollectionSummaryManager`
            New manager object for collection summaries.
        """
        specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
        tables = CollectionSummaryTables(
            datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType),
            dimensions=NamedKeyDict[GovernorDimension, sqlalchemy.schema.Table](
                {
                    dimension: context.addTable(f"collection_summary_{dimension.name}", spec)
                    for dimension, spec in specs.dimensions.items()
                }
            ).freeze(),
        )
        return cls(
            db=db,
            collections=collections,
            dimensions=dimensions,
            tables=tables,
            dataset_type_table=dataset_type_table,
            caching_context=caching_context,
        )

    def update(
        self,
        collection: CollectionRecord,
        dataset_type_ids: Iterable[int],
        summary: CollectionSummary,
    ) -> None:
        """Update the summary tables to associate the given collection with
        the given dataset types and governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        dataset_type_ids : `~collections.abc.Iterable` [ `int` ]
            Integer IDs for the dataset types to associate with this
            collection.
        summary : `CollectionSummary`
            Summary to store. Dataset types must correspond to
            ``dataset_type_ids``.

        Notes
        -----
        This method should only be called inside the transaction context of
        another operation that inserts or associates datasets.
        """
        self._db.ensure(
            self._tables.datasetType,
            *[
                {
                    "dataset_type_id": dataset_type_id,
                    self._collectionKeyName: collection.key,
                }
                for dataset_type_id in dataset_type_ids
            ],
        )
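        # Database.ensure performs an idempotent insert, skipping any rows
        # that already exist, so re-running an update for a collection that
        # already has these summary entries is harmless.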
        for dimension, values in summary.governors.items():
            if values:
                self._db.ensure(
                    self._tables.dimensions[dimension],
                    *[{self._collectionKeyName: collection.key, dimension: v} for v in values],
                )

    def fetch_summaries(
        self,
        collections: Iterable[CollectionRecord],
        dataset_type_names: Iterable[str] | None,
        dataset_type_factory: Callable[[sqlalchemy.engine.RowMapping], DatasetType],
    ) -> Mapping[Any, CollectionSummary]:
        """Fetch collection summaries given their names and dataset types.

        Parameters
        ----------
        collections : `~collections.abc.Iterable` [`CollectionRecord`]
            Collection records to query.
        dataset_type_names : `~collections.abc.Iterable` [`str`] or `None`
            Names of dataset types to include in the returned summaries. If
            `None`, all dataset types are included.
        dataset_type_factory : `Callable`
            Callable that takes a table row and makes a `DatasetType`
            instance out of it.

        Returns
        -------
        summaries : `~collections.abc.Mapping` [`Any`, `CollectionSummary`]
            Collection summaries indexed by collection record key. This
            mapping also contains all nested non-chained collections of the
            chained collections.
        """
        summaries: dict[Any, CollectionSummary] = {}
        # Check what we have in the cache first.
        if self._caching_context.collection_summaries is not None:
            summaries, missing_keys = self._caching_context.collection_summaries.find_summaries(
                [record.key for record in collections]
            )
            if not missing_keys:
                return summaries
            else:
                collections = [record for record in collections if record.key in missing_keys]
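
        # From here on ``collections`` holds only the records whose summaries
        # still need to be fetched from the database.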

        # Need to expand all chained collections first.
        non_chains: list[CollectionRecord] = []
        chains: dict[CollectionRecord, list[CollectionRecord]] = {}
        for collection in collections:
            if collection.type is CollectionType.CHAINED:
                children = self._collections.resolve_wildcard(
                    CollectionWildcard.from_names([collection.name]),
                    flatten_chains=True,
                    include_chains=False,
                )
                non_chains += children
                chains[collection] = children
            else:
                non_chains.append(collection)
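
        # Summary rows are only stored for non-chained collections; a chain's
        # summary is reconstructed below as the union of the summaries of its
        # flattened children.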

        # Set up the SQL query we'll use to fetch all of the summary
        # information at once.
        coll_col = self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName)
        dataset_type_id_col = self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id")
        columns = [coll_col, dataset_type_id_col] + list(self._dataset_type_table.columns)
        fromClause: sqlalchemy.sql.expression.FromClause = self._tables.datasetType.join(
            self._dataset_type_table
        )
        for dimension, table in self._tables.dimensions.items():
            columns.append(table.columns[dimension.name].label(dimension.name))
            fromClause = fromClause.join(
                table,
                onclause=(
                    self._tables.datasetType.columns[self._collectionKeyName]
                    == table.columns[self._collectionKeyName]
                ),
                isouter=True,
            )

        sql = sqlalchemy.sql.select(*columns).select_from(fromClause)
        sql = sql.where(coll_col.in_([coll.key for coll in non_chains]))
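        # Schematically (the collection key column name comes from the
        # CollectionManager, e.g. "collection_id"), the statement built above
        # looks like:
        #
        #   SELECT <collection key>, <dataset type columns>, <governor values>
        #   FROM collection_summary_dataset_type
        #       JOIN dataset_type ON (dataset_type_id = dataset_type.id)
        #       LEFT OUTER JOIN collection_summary_<dimension>
        #           ON (<collection key>)
        #   WHERE <collection key> IN (<non-chain keys>)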

        # When caching is enabled we need to fetch and cache complete
        # summaries, so the dataset type filter is only applied when the
        # cache is disabled.
        if self._caching_context.collection_summaries is None:
            if dataset_type_names is not None:
                sql = sql.where(self._dataset_type_table.columns["name"].in_(dataset_type_names))

        # Run the query and construct CollectionSummary objects from the
        # result rows. This will never include CHAINED collections or
        # collections with no datasets.
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        dataset_type_ids: dict[int, DatasetType] = {}
        for row in sql_rows:
            # Collection key should never be None/NULL; it's what we join on.
            # Extract it and use it to index the summaries mapping.
            collectionKey = row[self._collectionKeyName]
            # dataset_type_id should also never be None/NULL; it's in the
            # first table we joined.
            dataset_type_id = row["dataset_type_id"]
            if (dataset_type := dataset_type_ids.get(dataset_type_id)) is None:
                dataset_type_ids[dataset_type_id] = dataset_type = dataset_type_factory(row)
            # See if we have a summary already for this collection; if not,
            # make one.
            summary = summaries.get(collectionKey)
            if summary is None:
                summary = CollectionSummary()
                summaries[collectionKey] = summary
            # Update the summary with the values in this row that aren't
            # None/NULL (many will be NULL in general, because the governor
            # tables enter the query via LEFT OUTER JOIN).
            summary.dataset_types.add(dataset_type)
            for dimension in self._tables.dimensions:
                value = row[dimension.name]
                if value is not None:
                    summary.governors.setdefault(dimension.name, set()).add(value)

        # Add an empty summary for any missing collection.
        for collection in non_chains:
            if collection.key not in summaries:
                summaries[collection.key] = CollectionSummary()

        # Merge children into their chains' summaries.
        for chain, children in chains.items():
            summaries[chain.key] = CollectionSummary.union(*(summaries[child.key] for child in children))

        if self._caching_context.collection_summaries is not None:
            self._caching_context.collection_summaries.update(summaries)

        return summaries