Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 99% of 106 statements
coverage.py v7.4.0, created at 2024-01-16 10:43 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

from .... import ddl

__all__ = ("CollectionSummaryManager",)

import logging
from collections.abc import Callable, Iterable, Mapping
from typing import Any, Generic, TypeVar

import sqlalchemy

from ...._dataset_type import DatasetType
from ...._named import NamedKeyDict, NamedKeyMapping
from ....dimensions import GovernorDimension, addDimensionForeignKey
from ..._caching_context import CachingContext
from ..._collection_summary import CollectionSummary
from ..._collection_type import CollectionType
from ...interfaces import (
    CollectionManager,
    CollectionRecord,
    Database,
    DimensionRecordStorageManager,
    StaticTablesContext,
)
from ...wildcards import CollectionWildcard

_T = TypeVar("_T")


_LOG = logging.getLogger(__name__)


class CollectionSummaryTables(Generic[_T]):
    """Structure that holds the table or table specification objects that
    summarize the contents of collections.

    Parameters
    ----------
    datasetType : _T
        Table [specification] that summarizes which dataset types are in each
        collection.
    dimensions : `NamedKeyMapping`
        Mapping of table [specifications] that summarize which governor
        dimension values are present in the data IDs of each collection.
    """
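
    # ``_T`` is ``ddl.TableSpec`` while the schema is being declared and
    # ``sqlalchemy.schema.Table`` once the tables actually exist; see
    # ``makeTableSpecs`` and ``CollectionSummaryManager.initialize``.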

    def __init__(
        self,
        datasetType: _T,
        dimensions: NamedKeyMapping[GovernorDimension, _T],
    ):
        self.datasetType = datasetType
        self.dimensions = dimensions

    @classmethod
    def makeTableSpecs(
        cls,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[ddl.TableSpec]:
        """Create specifications for all summary tables.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `ddl.TableSpec` ]
            Structure containing table specifications.
        """
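        # The resulting specs describe tables shaped roughly like this
        # (a sketch; the actual collection key column name comes from
        # ``collections.addCollectionForeignKey``):
        #
        #   collection_summary_dataset_type(<collection> PK/FK, dataset_type_id PK/FK)
        #   collection_summary_<dimension>(<collection> PK/FK, <dimension> PK/FK)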
        # Spec for collection_summary_dataset_type.
        datasetTypeTableSpec = ddl.TableSpec(fields=[])
        collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE")
        datasetTypeTableSpec.fields.add(
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True)
        )
        datasetTypeTableSpec.foreignKeys.append(
            ddl.ForeignKeySpec(
                "dataset_type", source=("dataset_type_id",), target=("id",), onDelete="CASCADE"
            )
        )
        # Specs for collection_summary_<dimension>.
        dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]()
        for dimension in dimensions.universe.governor_dimensions:
            tableSpec = ddl.TableSpec(fields=[])
            collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE")
            addDimensionForeignKey(tableSpec, dimension, primaryKey=True)
            dimensionTableSpecs[dimension] = tableSpec
        return CollectionSummaryTables(
            datasetType=datasetTypeTableSpec,
            dimensions=dimensionTableSpecs.freeze(),
        )


class CollectionSummaryManager:
    """Object that manages the summaries of which dataset types and governor
    dimension values are present in a collection.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    tables : `CollectionSummaryTables`
        Struct containing the tables that hold collection summaries.
    dataset_type_table : `sqlalchemy.schema.Table`
        Table containing dataset type definitions.
    caching_context : `CachingContext`
        Object controlling caching of information returned by managers.
    """

    def __init__(
        self,
        db: Database,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        tables: CollectionSummaryTables[sqlalchemy.schema.Table],
        dataset_type_table: sqlalchemy.schema.Table,
        caching_context: CachingContext,
    ):
        self._db = db
        self._collections = collections
        self._collectionKeyName = collections.getCollectionForeignKeyName()
        self._dimensions = dimensions
        self._tables = tables
        self._dataset_type_table = dataset_type_table
        self._caching_context = caching_context

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        dataset_type_table: sqlalchemy.schema.Table,
        caching_context: CachingContext,
    ) -> CollectionSummaryManager:
        """Create all summary tables (or check that they have been created),
        returning an object to manage them.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        dataset_type_table : `sqlalchemy.schema.Table`
            Table containing dataset type definitions.
        caching_context : `CachingContext`
            Object controlling caching of information returned by managers.

        Returns
        -------
        manager : `CollectionSummaryManager`
            New manager object for collection summaries.
        """
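        # ``context.addTable`` returns the SQLAlchemy representation of each
        # table whether it is being declared for the first time or already
        # exists, which is what lets this method both create the tables and
        # merely check them on an existing repository.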
        specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
        tables = CollectionSummaryTables(
            datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType),
            dimensions=NamedKeyDict[GovernorDimension, sqlalchemy.schema.Table](
                {
                    dimension: context.addTable(f"collection_summary_{dimension.name}", spec)
                    for dimension, spec in specs.dimensions.items()
                }
            ).freeze(),
        )
        return cls(
            db=db,
            collections=collections,
            dimensions=dimensions,
            tables=tables,
            dataset_type_table=dataset_type_table,
            caching_context=caching_context,
        )

    def update(
        self,
        collection: CollectionRecord,
        dataset_type_ids: Iterable[int],
        summary: CollectionSummary,
    ) -> None:
        """Update the summary tables to associate the given collection with
        the given dataset types and governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        dataset_type_ids : `~collections.abc.Iterable` [ `int` ]
            Integer IDs for the dataset types to associate with this
            collection.
        summary : `CollectionSummary`
            Summary to store. Dataset types must correspond to
            ``dataset_type_ids``.

        Notes
        -----
        This method should only be called inside the transaction context of
        another operation that inserts or associates datasets.
        """
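        # ``Database.ensure`` performs an idempotent insert (rows that would
        # violate a unique constraint are skipped rather than raising), so
        # re-running this for a collection that already has these summary
        # rows is harmless.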
        self._db.ensure(
            self._tables.datasetType,
            *[
                {
                    "dataset_type_id": dataset_type_id,
                    self._collectionKeyName: collection.key,
                }
                for dataset_type_id in dataset_type_ids
            ],
        )
        for dimension, values in summary.governors.items():
            if values:
                self._db.ensure(
                    self._tables.dimensions[dimension],
                    *[{self._collectionKeyName: collection.key, dimension: v} for v in values],
                )

    def fetch_summaries(
        self,
        collections: Iterable[CollectionRecord],
        dataset_type_names: Iterable[str] | None,
        dataset_type_factory: Callable[[sqlalchemy.engine.RowMapping], DatasetType],
    ) -> Mapping[Any, CollectionSummary]:
        """Fetch collection summaries given their names and dataset types.

        Parameters
        ----------
        collections : `~collections.abc.Iterable` [`CollectionRecord`]
            Collection records to query.
        dataset_type_names : `~collections.abc.Iterable` [`str`]
            Names of dataset types to include in the returned summaries. If
            `None`, all dataset types are included.
        dataset_type_factory : `Callable`
            Callable that takes a table row and makes a `DatasetType` instance
            out of it.

        Returns
        -------
        summaries : `~collections.abc.Mapping` [`Any`, `CollectionSummary`]
            Collection summaries indexed by collection record key. This
            mapping also contains summaries for all nested non-chained
            collections of the chained collections.
        """
        summaries: dict[Any, CollectionSummary] = {}
        # Check what we have in cache first.
        if self._caching_context.collection_summaries is not None:
            summaries, missing_keys = self._caching_context.collection_summaries.find_summaries(
                [record.key for record in collections]
            )
            if not missing_keys:
                return summaries
            else:
                collections = [record for record in collections if record.key in missing_keys]

        # Need to expand all chained collections first.
        non_chains: list[CollectionRecord] = []
        chains: dict[CollectionRecord, list[CollectionRecord]] = {}
        for collection in collections:
            if collection.type is CollectionType.CHAINED:
                children = self._collections.resolve_wildcard(
                    CollectionWildcard.from_names([collection.name]),
                    flatten_chains=True,
                    include_chains=False,
                )
                non_chains += children
                chains[collection] = children
            else:
                non_chains.append(collection)
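
        # E.g. querying a chain "refcats" -> ["refcats/run1", "refcats/run2"]
        # (hypothetical names) puts both runs into ``non_chains`` and records
        # the parent/children relationship in ``chains``, so the chain's
        # summary can be rebuilt as the union of its children afterwards.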

        _LOG.debug("Fetching summaries for collections %s.", [record.name for record in non_chains])

        # Set up the SQL query we'll use to fetch all of the summary
        # information at once.
        coll_col = self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName)
        dataset_type_id_col = self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id")
        columns = [coll_col, dataset_type_id_col] + list(self._dataset_type_table.columns)
        fromClause: sqlalchemy.sql.expression.FromClause = self._tables.datasetType.join(
            self._dataset_type_table
        )
        for dimension, table in self._tables.dimensions.items():
            columns.append(table.columns[dimension.name].label(dimension.name))
            fromClause = fromClause.join(
                table,
                onclause=(
                    self._tables.datasetType.columns[self._collectionKeyName]
                    == table.columns[self._collectionKeyName]
                ),
                isouter=True,
            )

        sql = sqlalchemy.sql.select(*columns).select_from(fromClause)
        sql = sql.where(coll_col.in_([coll.key for coll in non_chains]))
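        # The generated statement looks roughly like this (a sketch, assuming
        # the collection key column is named "collection_id" and "instrument"
        # is the only governor dimension):
        #
        #   SELECT cst.collection_id, cst.dataset_type_id, dataset_type.*,
        #          csi.instrument
        #   FROM collection_summary_dataset_type cst
        #       JOIN dataset_type ON cst.dataset_type_id = dataset_type.id
        #       LEFT OUTER JOIN collection_summary_instrument csi
        #           ON cst.collection_id = csi.collection_id
        #   WHERE cst.collection_id IN (...)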
        # For caching we need to fetch complete summaries, so only apply the
        # dataset type filter when the cache is not in use.
        if self._caching_context.collection_summaries is None:
            if dataset_type_names is not None:
                sql = sql.where(self._dataset_type_table.columns["name"].in_(dataset_type_names))

        # Run the query and construct CollectionSummary objects from the
        # result rows. This will never include CHAINED collections or
        # collections with no datasets.
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        dataset_type_ids: dict[int, DatasetType] = {}
        for row in sql_rows:
            # Collection key should never be None/NULL; it's what we join on.
            collectionKey = row[self._collectionKeyName]
            # dataset_type_id should also never be None/NULL; it's in the
            # first table we joined.
            dataset_type_id = row["dataset_type_id"]
            if (dataset_type := dataset_type_ids.get(dataset_type_id)) is None:
                dataset_type_ids[dataset_type_id] = dataset_type = dataset_type_factory(row)
            # See if we have a summary already for this collection; if not,
            # make one.
            summary = summaries.get(collectionKey)
            if summary is None:
                summary = CollectionSummary()
                summaries[collectionKey] = summary
            # Update the dimensions with the values in this row that aren't
            # None/NULL (many of them will be, in general, because these
            # enter the query via LEFT OUTER JOIN).
            summary.dataset_types.add(dataset_type)
            for dimension in self._tables.dimensions:
                value = row[dimension.name]
                if value is not None:
                    summary.governors.setdefault(dimension.name, set()).add(value)

        # Add an empty summary for any missing collection.
        for collection in non_chains:
            if collection.key not in summaries:
                summaries[collection.key] = CollectionSummary()

        # Merge the child summaries into their parent chains' summaries.
        for chain, children in chains.items():
            summaries[chain.key] = CollectionSummary.union(*(summaries[child.key] for child in children))

        if self._caching_context.collection_summaries is not None:
            self._caching_context.collection_summaries.update(summaries)

        return summaries