Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 99% (107 statements)
coverage.py v7.4.1, created at 2024-02-01 11:19 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

from .... import ddl

__all__ = ("CollectionSummaryManager",)

import logging
from collections.abc import Callable, Iterable, Mapping
from typing import Any, Generic, TypeVar

import sqlalchemy

from ...._dataset_type import DatasetType
from ...._named import NamedKeyDict, NamedKeyMapping
from ....dimensions import GovernorDimension, addDimensionForeignKey
from ..._caching_context import CachingContext
from ..._collection_summary import CollectionSummary
from ..._collection_type import CollectionType
from ...interfaces import (
    CollectionManager,
    CollectionRecord,
    Database,
    DimensionRecordStorageManager,
    StaticTablesContext,
)
from ...wildcards import CollectionWildcard

_T = TypeVar("_T")


_LOG = logging.getLogger(__name__)


class CollectionSummaryTables(Generic[_T]):
    """Structure that holds the table or table specification objects that
    summarize the contents of collections.

    Parameters
    ----------
    datasetType : _T
        Table [specification] that summarizes which dataset types are in each
        collection.
    dimensions : `NamedKeyMapping`
        Mapping of table [specifications] that summarize which governor
        dimension values are present in the data IDs of each collection.
    """

    def __init__(
        self,
        datasetType: _T,
        dimensions: NamedKeyMapping[GovernorDimension, _T],
    ):
        self.datasetType = datasetType
        self.dimensions = dimensions

    @classmethod
    def makeTableSpecs(
        cls,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[ddl.TableSpec]:
        """Create specifications for all summary tables.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `ddl.TableSpec` ]
            Structure containing table specifications.
        """
        # Spec for collection_summary_dataset_type.
        datasetTypeTableSpec = ddl.TableSpec(fields=[])
        collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE")
        datasetTypeTableSpec.fields.add(
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True)
        )
        datasetTypeTableSpec.foreignKeys.append(
            ddl.ForeignKeySpec(
                "dataset_type", source=("dataset_type_id",), target=("id",), onDelete="CASCADE"
            )
        )
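        # The resulting table is just (collection key, dataset_type_id), with
        # the two columns forming a compound primary key and both foreign keys
        # declared ON DELETE CASCADE, so summary rows disappear automatically
        # when a collection or dataset type is removed.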
        # Specs for collection_summary_<dimension>.
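        # Each governor dimension (e.g. ``instrument`` or ``skymap`` in the
        # default dimension universe) gets its own two-column table keyed on
        # (collection, dimension value).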
        dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]()
        for dimension in dimensions.universe.governor_dimensions:
            tableSpec = ddl.TableSpec(fields=[])
            collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE")
            addDimensionForeignKey(tableSpec, dimension, primaryKey=True)
            dimensionTableSpecs[dimension] = tableSpec
        return CollectionSummaryTables(
            datasetType=datasetTypeTableSpec,
            dimensions=dimensionTableSpecs.freeze(),
        )


class CollectionSummaryManager:
    """Object that manages the summaries of which dataset types and governor
    dimension values are present in a collection.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    tables : `CollectionSummaryTables`
        Struct containing the tables that hold collection summaries.
    dataset_type_table : `sqlalchemy.schema.Table`
        Table containing dataset type definitions.
    caching_context : `CachingContext`
        Object controlling caching of information returned by managers.
    """

    def __init__(
        self,
        db: Database,
        *,
        collections: CollectionManager,
        tables: CollectionSummaryTables[sqlalchemy.schema.Table],
        dataset_type_table: sqlalchemy.schema.Table,
        caching_context: CachingContext,
    ):
        self._db = db
        self._collections = collections
        self._collectionKeyName = collections.getCollectionForeignKeyName()
        self._tables = tables
        self._dataset_type_table = dataset_type_table
        self._caching_context = caching_context

    def clone(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        caching_context: CachingContext,
    ) -> CollectionSummaryManager:
        """Make an independent copy of this manager instance bound to new
        instances of `Database` and other managers.

        Parameters
        ----------
        db : `Database`
            New `Database` object to use when instantiating the manager.
        collections : `CollectionManager`
            New `CollectionManager` object to use when instantiating the
            manager.
        caching_context : `CachingContext`
            New `CachingContext` object to use when instantiating the manager.

        Returns
        -------
        instance : `CollectionSummaryManager`
            New manager instance with the same configuration as this instance,
            but bound to a new `Database` object.
        """
        return CollectionSummaryManager(
            db=db,
            collections=collections,
            tables=self._tables,
            dataset_type_table=self._dataset_type_table,
            caching_context=caching_context,
        )

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        dataset_type_table: sqlalchemy.schema.Table,
        caching_context: CachingContext,
    ) -> CollectionSummaryManager:
        """Create all summary tables (or check that they have been created),
        returning an object to manage them.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        dataset_type_table : `sqlalchemy.schema.Table`
            Table containing dataset type definitions.
        caching_context : `CachingContext`
            Object controlling caching of information returned by managers.

        Returns
        -------
        manager : `CollectionSummaryManager`
            New manager object for collection summaries.
        """
        specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
        tables = CollectionSummaryTables(
            datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType),
            dimensions=NamedKeyDict[GovernorDimension, sqlalchemy.schema.Table](
                {
                    dimension: context.addTable(f"collection_summary_{dimension.name}", spec)
                    for dimension, spec in specs.dimensions.items()
                }
            ).freeze(),
        )
        return cls(
            db=db,
            collections=collections,
            tables=tables,
            dataset_type_table=dataset_type_table,
            caching_context=caching_context,
        )

    def update(
        self,
        collection: CollectionRecord,
        dataset_type_ids: Iterable[int],
        summary: CollectionSummary,
    ) -> None:
        """Update the summary tables to associate the given collection with
        the given dataset types and governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        dataset_type_ids : `~collections.abc.Iterable` [ `int` ]
            Integer IDs for the dataset types to associate with this
            collection.
        summary : `CollectionSummary`
            Summary to store.  Dataset types must correspond to
            ``dataset_type_ids``.

        Notes
        -----
        This method should only be called inside the transaction context of
        another operation that inserts or associates datasets.
        """
        self._db.ensure(
            self._tables.datasetType,
            *[
                {
                    "dataset_type_id": dataset_type_id,
                    self._collectionKeyName: collection.key,
                }
                for dataset_type_id in dataset_type_ids
            ],
        )
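        # Do the same for each governor dimension present in the summary; each
        # dimension has its own summary table, keyed on (collection, value).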
        for dimension, values in summary.governors.items():
            if values:
                self._db.ensure(
                    self._tables.dimensions[dimension],
                    *[{self._collectionKeyName: collection.key, dimension: v} for v in values],
                )

    def fetch_summaries(
        self,
        collections: Iterable[CollectionRecord],
        dataset_type_names: Iterable[str] | None,
        dataset_type_factory: Callable[[sqlalchemy.engine.RowMapping], DatasetType],
    ) -> Mapping[Any, CollectionSummary]:
        """Fetch summaries for the given collections and dataset types.

        Parameters
        ----------
        collections : `~collections.abc.Iterable` [ `CollectionRecord` ]
            Collection records to query.
        dataset_type_names : `~collections.abc.Iterable` [ `str` ] or `None`
            Names of dataset types to include in the returned summaries.  If
            `None`, all dataset types are included.
        dataset_type_factory : `Callable`
            Callable that takes a table row and makes a `DatasetType` instance
            out of it.

        Returns
        -------
        summaries : `~collections.abc.Mapping` [ `Any`, `CollectionSummary` ]
            Collection summaries indexed by collection record key.  This
            mapping will also contain entries for all nested non-chained
            collections of the chained collections.
        """
        summaries: dict[Any, CollectionSummary] = {}
        # Check what we have in cache first.
        if self._caching_context.collection_summaries is not None:
            summaries, missing_keys = self._caching_context.collection_summaries.find_summaries(
                [record.key for record in collections]
            )
            if not missing_keys:
                return summaries
            else:
                collections = [record for record in collections if record.key in missing_keys]

        # Need to expand all chained collections first.
        non_chains: list[CollectionRecord] = []
        chains: dict[CollectionRecord, list[CollectionRecord]] = {}
        for collection in collections:
            if collection.type is CollectionType.CHAINED:
                children = self._collections.resolve_wildcard(
                    CollectionWildcard.from_names([collection.name]),
                    flatten_chains=True,
                    include_chains=False,
                )
                non_chains += children
                chains[collection] = children
            else:
                non_chains.append(collection)
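        # For example, a CHAINED collection whose (recursively flattened)
        # members are two RUN collections contributes both runs to
        # ``non_chains`` and records the mapping in ``chains``, so its summary
        # can be reassembled as the union of its children's summaries below.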

        _LOG.debug("Fetching summaries for collections %s.", [record.name for record in non_chains])

        # Set up the SQL query we'll use to fetch all of the summary
        # information at once.
        coll_col = self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName)
        dataset_type_id_col = self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id")
        columns = [coll_col, dataset_type_id_col] + list(self._dataset_type_table.columns)
        fromClause: sqlalchemy.sql.expression.FromClause = self._tables.datasetType.join(
            self._dataset_type_table
        )
        for dimension, table in self._tables.dimensions.items():
            columns.append(table.columns[dimension.name].label(dimension.name))
            fromClause = fromClause.join(
                table,
                onclause=(
                    self._tables.datasetType.columns[self._collectionKeyName]
                    == table.columns[self._collectionKeyName]
                ),
                isouter=True,
            )
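        # The query built above is roughly (assuming governor dimensions
        # ``instrument`` and ``skymap`` and a collection key column named
        # ``collection_id``):
        #
        #   SELECT collection_id, dataset_type_id, dataset_type.*,
        #          instrument, skymap
        #   FROM collection_summary_dataset_type
        #       JOIN dataset_type ON (dataset_type_id = dataset_type.id)
        #       LEFT JOIN collection_summary_instrument USING (collection_id)
        #       LEFT JOIN collection_summary_skymap USING (collection_id)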

        sql = sqlalchemy.sql.select(*columns).select_from(fromClause)
        sql = sql.where(coll_col.in_([coll.key for coll in non_chains]))
        # For caching we need to fetch complete summaries, so only filter on
        # dataset type name when caching is disabled.
        if self._caching_context.collection_summaries is None:
            if dataset_type_names is not None:
                sql = sql.where(self._dataset_type_table.columns["name"].in_(dataset_type_names))

        # Run the query and construct CollectionSummary objects from the
        # result rows.  This will never include CHAINED collections or
        # collections with no datasets.
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
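        # Note that a collection can appear in many rows: the outer joins
        # yield one row per combination of dataset type and governor values,
        # so the loop below accumulates into each summary rather than
        # overwriting it.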
        dataset_type_ids: dict[int, DatasetType] = {}
        for row in sql_rows:
            # Collection key should never be None/NULL; it's what we join on.
            # Extract that and then turn it into a collection name.
            collectionKey = row[self._collectionKeyName]
            # dataset_type_id should also never be None/NULL; it's in the
            # first table we joined.
            dataset_type_id = row["dataset_type_id"]
            if (dataset_type := dataset_type_ids.get(dataset_type_id)) is None:
                dataset_type_ids[dataset_type_id] = dataset_type = dataset_type_factory(row)
            # See if we have a summary already for this collection; if not,
            # make one.
            summary = summaries.get(collectionKey)
            if summary is None:
                summary = CollectionSummary()
                summaries[collectionKey] = summary
            # Update the summary with the values in this row that aren't
            # None/NULL (many will be NULL in general, because the dimension
            # tables enter the query via LEFT OUTER JOIN).
            summary.dataset_types.add(dataset_type)
            for dimension in self._tables.dimensions:
                value = row[dimension.name]
                if value is not None:
                    summary.governors.setdefault(dimension.name, set()).add(value)

        # Add an empty summary for any missing collection.
        for collection in non_chains:
            if collection.key not in summaries:
                summaries[collection.key] = CollectionSummary()

        # Merge the children's summaries into their chains' summaries.
        for chain, children in chains.items():
            summaries[chain.key] = CollectionSummary.union(*(summaries[child.key] for child in children))

        if self._caching_context.collection_summaries is not None:
            self._caching_context.collection_summaries.update(summaries)

        return summaries