Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 87%
328 statements
coverage.py v7.4.4, created at 2024-04-19 03:43 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
29from __future__ import annotations
31from .... import ddl
33__all__ = ("ByDimensionsDatasetRecordStorage",)
35import datetime
36from collections.abc import Callable, Iterable, Iterator, Sequence, Set
37from typing import TYPE_CHECKING
39import astropy.time
40import sqlalchemy
41from lsst.daf.relation import Relation, sql
43from ...._column_tags import DatasetColumnTag, DimensionKeyColumnTag
44from ...._column_type_info import LogicalColumn
45from ...._dataset_ref import DatasetId, DatasetIdFactory, DatasetIdGenEnum, DatasetRef
46from ...._dataset_type import DatasetType
47from ...._exceptions import CollectionTypeError
48from ...._timespan import Timespan
49from ....dimensions import DataCoordinate
50from ....direct_query_driver import QueryBuilder, QueryJoiner # new query system, server+direct only
51from ....queries import tree as qt # new query system, both clients + server
52from ..._collection_summary import CollectionSummary
53from ..._collection_type import CollectionType
54from ..._exceptions import ConflictingDefinitionError
55from ...interfaces import DatasetRecordStorage
56from ...queries import SqlQueryContext # old registry query system
57from .tables import makeTagTableSpec
59if TYPE_CHECKING:
60 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
61 from .summaries import CollectionSummaryManager
62 from .tables import StaticDatasetTablesTuple
65class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
66 """Dataset record storage implementation paired with
67 `ByDimensionsDatasetRecordStorageManagerUUID`; see that class for more
68 information.
70 Instances of this class should never be constructed directly; use
71 `DatasetRecordStorageManager.register` instead.
73 Parameters
74 ----------
75 datasetType : `DatasetType`
76 The dataset type to use.
77 db : `Database`
78 Database connection.
79 dataset_type_id : `int`
80 Dataset type identifier.
81 collections : `CollectionManager`
82 The collection manager.
83 static : `StaticDatasetTablesTuple`
84 Named tuple of the static dataset tables used by this storage object.
85 summaries : `CollectionSummaryManager`
86 Collection summary manager.
87 tags_table_factory : `~collections.abc.Callable`
88 Factory for creating tags tables.
89 use_astropy_ingest_date : `bool`
90 Whether to use Astropy for ingest date.
91 calibs_table_factory : `~collections.abc.Callable`
92 Factory for creating calibration tables.
93 """
95 def __init__(
96 self,
97 *,
98 datasetType: DatasetType,
99 db: Database,
100 dataset_type_id: int,
101 collections: CollectionManager,
102 static: StaticDatasetTablesTuple,
103 summaries: CollectionSummaryManager,
104 tags_table_factory: Callable[[], sqlalchemy.schema.Table],
105 use_astropy_ingest_date: bool,
106 calibs_table_factory: Callable[[], sqlalchemy.schema.Table] | None,
107 ):
108 super().__init__(datasetType=datasetType)
109 self._dataset_type_id = dataset_type_id
110 self._db = db
111 self._collections = collections
112 self._static = static
113 self._summaries = summaries
114 self._tags_table_factory = tags_table_factory
115 self._calibs_table_factory = calibs_table_factory
116 self._runKeyColumn = collections.getRunForeignKeyName()
117 self._use_astropy = use_astropy_ingest_date
118 self._tags_table: sqlalchemy.schema.Table | None = None
119 self._calibs_table: sqlalchemy.schema.Table | None = None
121 @property
122 def _tags(self) -> sqlalchemy.schema.Table:
123 if self._tags_table is None:
124 self._tags_table = self._tags_table_factory()
125 return self._tags_table
127 @property
128 def _calibs(self) -> sqlalchemy.schema.Table | None:
129 if self._calibs_table is None:
130 if self._calibs_table_factory is None: 130 ↛ 131 (the condition on line 130 was never true)
131 return None
132 self._calibs_table = self._calibs_table_factory()
133 return self._calibs_table
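The two properties above create the dynamic tags/calibs tables lazily: each factory callable runs at most once and its result is cached, and a missing calibs factory simply yields `None`. A minimal, self-contained sketch of that caching pattern (the class and names below are illustrative, not part of daf_butler):

    from collections.abc import Callable

    class LazyTable:
        """Illustrative only: cache the result of an optional table factory."""

        def __init__(self, factory: Callable[[], object] | None) -> None:
            self._factory = factory
            self._table: object | None = None

        @property
        def table(self) -> object | None:
            if self._table is None:
                if self._factory is None:
                    # e.g. non-calibration dataset types have no calibs table
                    return None
                self._table = self._factory()  # build once, reuse afterwards
            return self._table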
135 def delete(self, datasets: Iterable[DatasetRef]) -> None:
136 # Docstring inherited from DatasetRecordStorage.
137 # Only delete from common dataset table; ON DELETE foreign key clauses
138 # will handle the rest.
139 self._db.delete(
140 self._static.dataset,
141 ["id"],
142 *[{"id": dataset.id} for dataset in datasets],
143 )
145 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
146 # Docstring inherited from DatasetRecordStorage.
147 if collection.type is not CollectionType.TAGGED: 147 ↛ 148 (the condition on line 147 was never true)
148 raise TypeError(
149 f"Cannot associate into collection '{collection.name}' "
150 f"of type {collection.type.name}; must be TAGGED."
151 )
152 protoRow = {
153 self._collections.getCollectionForeignKeyName(): collection.key,
154 "dataset_type_id": self._dataset_type_id,
155 }
156 rows = []
157 summary = CollectionSummary()
158 for dataset in summary.add_datasets_generator(datasets):
159 rows.append(dict(protoRow, dataset_id=dataset.id, **dataset.dataId.required))
160 # Update the summary tables for this collection in case this is the
161 # first time this dataset type or these governor values will be
162 # inserted there.
163 self._summaries.update(collection, [self._dataset_type_id], summary)
164 # Update the tag table itself.
165 self._db.replace(self._tags, *rows)
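`associate` builds one tag row per dataset by overlaying per-dataset values on a prototype row that already carries the collection key and dataset type ID. A tiny illustration of that `dict(proto, ...)` merge, with made-up keys and values:

    proto_row = {"collection_id": 42, "dataset_type_id": 7}  # hypothetical keys
    # dict(proto, **extras) copies the prototype and adds the per-dataset columns;
    # {"instrument": "X", "detector": 9} stands in for dataId.required.
    row = dict(proto_row, dataset_id="11111111-2222-3333-4444-555555555555",
               instrument="X", detector=9)
    assert row["collection_id"] == 42 and row["detector"] == 9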
167 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
168 # Docstring inherited from DatasetRecordStorage.
169 if collection.type is not CollectionType.TAGGED: 169 ↛ 170 (the condition on line 169 was never true)
170 raise TypeError(
171 f"Cannot disassociate from collection '{collection.name}' "
172 f"of type {collection.type.name}; must be TAGGED."
173 )
174 rows = [
175 {
176 "dataset_id": dataset.id,
177 self._collections.getCollectionForeignKeyName(): collection.key,
178 }
179 for dataset in datasets
180 ]
181 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows)
183 def _buildCalibOverlapQuery(
184 self,
185 collection: CollectionRecord,
186 data_ids: set[DataCoordinate] | None,
187 timespan: Timespan,
188 context: SqlQueryContext,
189 ) -> Relation:
190 relation = self.make_relation(
191 collection, columns={"timespan", "dataset_id", "calib_pkey"}, context=context
192 ).with_rows_satisfying(
193 context.make_timespan_overlap_predicate(
194 DatasetColumnTag(self.datasetType.name, "timespan"), timespan
195 ),
196 )
197 if data_ids is not None:
198 relation = relation.join(
199 context.make_data_id_relation(
200 data_ids, self.datasetType.dimensions.required.names
201 ).transferred_to(context.sql_engine),
202 )
203 return relation
205 def certify(
206 self,
207 collection: CollectionRecord,
208 datasets: Iterable[DatasetRef],
209 timespan: Timespan,
210 context: SqlQueryContext,
211 ) -> None:
212 # Docstring inherited from DatasetRecordStorage.
213 if self._calibs is None: 213 ↛ 214 (the condition on line 213 was never true)
214 raise CollectionTypeError(
215 f"Cannot certify datasets of type {self.datasetType.name}, for which "
216 "DatasetType.isCalibration() is False."
217 )
218 if collection.type is not CollectionType.CALIBRATION: 218 ↛ 219 (the condition on line 218 was never true)
219 raise CollectionTypeError(
220 f"Cannot certify into collection '{collection.name}' "
221 f"of type {collection.type.name}; must be CALIBRATION."
222 )
223 TimespanReprClass = self._db.getTimespanRepresentation()
224 protoRow = {
225 self._collections.getCollectionForeignKeyName(): collection.key,
226 "dataset_type_id": self._dataset_type_id,
227 }
228 rows = []
229 dataIds: set[DataCoordinate] | None = (
230 set() if not TimespanReprClass.hasExclusionConstraint() else None
231 )
232 summary = CollectionSummary()
233 for dataset in summary.add_datasets_generator(datasets):
234 row = dict(protoRow, dataset_id=dataset.id, **dataset.dataId.required)
235 TimespanReprClass.update(timespan, result=row)
236 rows.append(row)
237 if dataIds is not None: 237 ↛ 233 (the condition on line 237 was never false)
238 dataIds.add(dataset.dataId)
239 # Update the summary tables for this collection in case this is the
240 # first time this dataset type or these governor values will be
241 # inserted there.
242 self._summaries.update(collection, [self._dataset_type_id], summary)
243 # Update the association table itself.
244 if TimespanReprClass.hasExclusionConstraint(): 244 ↛ 247 (the condition on line 244 was never true)
245 # Rely on database constraint to enforce invariants; we just
246 # reraise the exception for consistency across DB engines.
247 try:
248 self._db.insert(self._calibs, *rows)
249 except sqlalchemy.exc.IntegrityError as err:
250 raise ConflictingDefinitionError(
251 f"Validity range conflict certifying datasets of type {self.datasetType.name} "
252 f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
253 ) from err
254 else:
255 # Have to implement exclusion constraint ourselves.
256 # Start by building a SELECT query for any rows that would overlap
257 # this one.
258 relation = self._buildCalibOverlapQuery(collection, dataIds, timespan, context)
259 # Acquire a table lock to ensure there are no concurrent writes
260 # that could invalidate our checking before we finish the inserts. We
261 # use a SAVEPOINT in case there is an outer transaction that a
262 # failure here should not roll back.
263 with self._db.transaction(lock=[self._calibs], savepoint=True):
264 # Enter SqlQueryContext in case we need to use a temporary
265 # table to include the given data IDs in the query. Note that
266 # by doing this inside the transaction, we make sure it doesn't
267 # attempt to close the session when it's done, since it just
268 # sees an already-open session that it knows it shouldn't
269 # manage.
270 with context:
271 # Run the check SELECT query.
272 conflicting = context.count(context.process(relation))
273 if conflicting > 0:
274 raise ConflictingDefinitionError(
275 f"{conflicting} validity range conflicts certifying datasets of type "
276 f"{self.datasetType.name} into {collection.name} for range "
277 f"[{timespan.begin}, {timespan.end})."
278 )
279 # Proceed with the insert.
280 self._db.insert(self._calibs, *rows)
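When the database cannot enforce a timespan exclusion constraint, `certify` takes a table lock and checks for overlapping validity ranges itself before inserting. For half-open ranges [begin, end) the overlap test reduces to the standard interval condition; a pure-Python sketch (an illustrative stand-in, not the real `Timespan` API):

    def overlaps(a_begin: int, a_end: int, b_begin: int, b_end: int) -> bool:
        # Illustrative: two half-open ranges [begin, end) intersect iff each
        # one starts before the other one ends.
        return a_begin < b_end and b_begin < a_end

    assert overlaps(0, 10, 5, 15)       # partial overlap
    assert not overlaps(0, 10, 10, 20)  # ranges that only touch do not overlap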
282 def decertify(
283 self,
284 collection: CollectionRecord,
285 timespan: Timespan,
286 *,
287 dataIds: Iterable[DataCoordinate] | None = None,
288 context: SqlQueryContext,
289 ) -> None:
290 # Docstring inherited from DatasetRecordStorage.
291 if self._calibs is None: 291 ↛ 292 (the condition on line 291 was never true)
292 raise CollectionTypeError(
293 f"Cannot decertify datasets of type {self.datasetType.name}, for which "
294 "DatasetType.isCalibration() is False."
295 )
296 if collection.type is not CollectionType.CALIBRATION: 296 ↛ 297 (the condition on line 296 was never true)
297 raise CollectionTypeError(
298 f"Cannot decertify from collection '{collection.name}' "
299 f"of type {collection.type.name}; must be CALIBRATION."
300 )
301 TimespanReprClass = self._db.getTimespanRepresentation()
302 # Construct a SELECT query to find all rows that overlap our inputs.
303 dataIdSet: set[DataCoordinate] | None
304 if dataIds is not None:
305 dataIdSet = set(dataIds)
306 else:
307 dataIdSet = None
308 relation = self._buildCalibOverlapQuery(collection, dataIdSet, timespan, context)
309 calib_pkey_tag = DatasetColumnTag(self.datasetType.name, "calib_pkey")
310 dataset_id_tag = DatasetColumnTag(self.datasetType.name, "dataset_id")
311 timespan_tag = DatasetColumnTag(self.datasetType.name, "timespan")
312 data_id_tags = [
313 (name, DimensionKeyColumnTag(name)) for name in self.datasetType.dimensions.required.names
314 ]
315 # Set up collections to populate with the rows we'll want to modify.
316 # The insert rows will have the same values for collection and
317 # dataset type.
318 protoInsertRow = {
319 self._collections.getCollectionForeignKeyName(): collection.key,
320 "dataset_type_id": self._dataset_type_id,
321 }
322 rowsToDelete = []
323 rowsToInsert = []
324 # Acquire a table lock to ensure there are no concurrent writes
325 # between the SELECT and the DELETE and INSERT queries based on it.
326 with self._db.transaction(lock=[self._calibs], savepoint=True):
327 # Enter SqlQueryContext in case we need to use a temporary table to
328 # include the given data IDs in the query (see similar block in
329 # certify for details).
330 with context:
331 for row in context.fetch_iterable(relation):
332 rowsToDelete.append({"id": row[calib_pkey_tag]})
333 # Construct the insert row(s) by copying the prototype row,
334 # then adding the dimension column values, then adding
335 # what's left of the timespan from that row after we
336 # subtract the given timespan.
337 newInsertRow = protoInsertRow.copy()
338 newInsertRow["dataset_id"] = row[dataset_id_tag]
339 for name, tag in data_id_tags:
340 newInsertRow[name] = row[tag]
341 rowTimespan = row[timespan_tag]
342 assert rowTimespan is not None, "Field should have a NOT NULL constraint."
343 for diffTimespan in rowTimespan.difference(timespan):
344 rowsToInsert.append(
345 TimespanReprClass.update(diffTimespan, result=newInsertRow.copy())
346 )
347 # Run the DELETE and INSERT queries.
348 self._db.delete(self._calibs, ["id"], *rowsToDelete)
349 self._db.insert(self._calibs, *rowsToInsert)
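`decertify` deletes each overlapping calib row and re-inserts whatever is left of its validity range after subtracting the decertified timespan, which can be zero, one, or two pieces. A pure-Python sketch of that subtraction for half-open integer ranges (illustrative only; the real work is done by `Timespan.difference`):

    def difference(begin: int, end: int, cut_begin: int, cut_end: int) -> list[tuple[int, int]]:
        """Illustrative: pieces of [begin, end) surviving removal of [cut_begin, cut_end)."""
        pieces = []
        if begin < cut_begin:
            pieces.append((begin, min(end, cut_begin)))  # piece before the cut
        if cut_end < end:
            pieces.append((max(begin, cut_end), end))    # piece after the cut
        return pieces

    assert difference(0, 10, 3, 7) == [(0, 3), (7, 10)]  # split into two pieces
    assert difference(0, 10, 0, 10) == []                # fully decertified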
351 def make_relation(
352 self,
353 *collections: CollectionRecord,
354 columns: Set[str],
355 context: SqlQueryContext,
356 ) -> Relation:
357 # Docstring inherited from DatasetRecordStorage.
358 collection_types = {collection.type for collection in collections}
359 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened."
360 TimespanReprClass = self._db.getTimespanRepresentation()
361 #
362 # There are two kinds of table in play here:
363 #
364 # - the static dataset table (with the dataset ID, dataset type ID,
365 # run ID/name, and ingest date);
366 #
367 # - the dynamic tags/calibs table (with the dataset ID, dataset
368 # type ID, collection ID/name, data ID, and possibly validity
369 # range).
370 #
371 # That means that we might want to return a query against either table
372 # or a JOIN of both, depending on which quantities the caller wants.
373 # But the data ID is always included, which means we'll always include
374 # the tags/calibs table and join in the static dataset table only if we
375 # need things from it that we can't get from the tags/calibs table.
376 #
377 # Note that it's important that we include a WHERE constraint on both
378 # tables for any column (e.g. dataset_type_id) that is in both when
379 # it's given explicitly; not doing so can prevent the query planner from
380 # using very important indexes. At present, we don't include those
381 # redundant columns in the JOIN ON expression, however, because the
382 # FOREIGN KEY (and its index) are defined only on dataset_id.
383 tag_relation: Relation | None = None
384 calib_relation: Relation | None = None
385 if collection_types != {CollectionType.CALIBRATION}:
386 # We'll need a subquery for the tags table if any of the given
387 # collections are not a CALIBRATION collection. This intentionally
388 # also fires when the list of collections is empty as a way to
389 # create a dummy subquery that we know will fail.
390 # We give the table an alias because it might appear multiple times
391 # in the same query, for different dataset types.
392 tags_parts = sql.Payload[LogicalColumn](self._tags.alias(f"{self.datasetType.name}_tags"))
393 if "timespan" in columns:
394 tags_parts.columns_available[DatasetColumnTag(self.datasetType.name, "timespan")] = (
395 TimespanReprClass.fromLiteral(Timespan(None, None))
396 )
397 tag_relation = self._finish_single_relation(
398 tags_parts,
399 columns,
400 [
401 (record, rank)
402 for rank, record in enumerate(collections)
403 if record.type is not CollectionType.CALIBRATION
404 ],
405 context,
406 )
407 assert "calib_pkey" not in columns, "For internal use only, and only for pure-calib queries."
408 if CollectionType.CALIBRATION in collection_types:
409 # If at least one collection is a CALIBRATION collection, we'll
410 # need a subquery for the calibs table, and could include the
411 # timespan as a result or constraint.
412 assert (
413 self._calibs is not None
414 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
415 calibs_parts = sql.Payload[LogicalColumn](self._calibs.alias(f"{self.datasetType.name}_calibs"))
416 if "timespan" in columns:
417 calibs_parts.columns_available[DatasetColumnTag(self.datasetType.name, "timespan")] = (
418 TimespanReprClass.from_columns(calibs_parts.from_clause.columns)
419 )
420 if "calib_pkey" in columns:
421 # This is a private extension not included in the base class
422 # interface, for internal use only in _buildCalibOverlapQuery,
423 # which needs access to the autoincrement primary key for the
424 # calib association table.
425 calibs_parts.columns_available[DatasetColumnTag(self.datasetType.name, "calib_pkey")] = (
426 calibs_parts.from_clause.columns.id
427 )
428 calib_relation = self._finish_single_relation(
429 calibs_parts,
430 columns,
431 [
432 (record, rank)
433 for rank, record in enumerate(collections)
434 if record.type is CollectionType.CALIBRATION
435 ],
436 context,
437 )
438 if tag_relation is not None:
439 if calib_relation is not None:
440 # daf_relation's chain operation does not automatically
441 # deduplicate; it's more like SQL's UNION ALL. To get UNION
442 # in SQL here, we add an explicit deduplication.
443 return tag_relation.chain(calib_relation).without_duplicates()
444 else:
445 return tag_relation
446 elif calib_relation is not None:
447 return calib_relation
448 else:
449 raise AssertionError("Branch should be unreachable.")
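The comment about `chain` above is the key subtlety when both a tags and a calibs subquery exist: chaining alone behaves like UNION ALL, so an explicit deduplication is added to get UNION semantics. The same distinction expressed in plain SQLAlchemy (the table and column below are made up for the demo):

    import sqlalchemy

    metadata = sqlalchemy.MetaData()
    # Demo table; not the real tags/calibs schema.
    t = sqlalchemy.Table("demo", metadata, sqlalchemy.Column("dataset_id", sqlalchemy.Integer))
    tags_query = sqlalchemy.select(t.c.dataset_id).where(t.c.dataset_id > 0)
    calibs_query = sqlalchemy.select(t.c.dataset_id).where(t.c.dataset_id < 100)

    keeps_duplicates = sqlalchemy.union_all(tags_query, calibs_query)  # like Relation.chain
    deduplicated = sqlalchemy.union(tags_query, calibs_query)          # chain + without_duplicates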
451 def _finish_single_relation(
452 self,
453 payload: sql.Payload[LogicalColumn],
454 requested_columns: Set[str],
455 collections: Sequence[tuple[CollectionRecord, int]],
456 context: SqlQueryContext,
457 ) -> Relation:
458 """Handle adding columns and WHERE terms that are not specific to
459 either the tags or calibs tables.
461 Helper method for `make_relation`.
463 Parameters
464 ----------
465 payload : `lsst.daf.relation.sql.Payload`
466 SQL query parts under construction, to be modified in-place and
467 used to construct the new relation.
468 requested_columns : `~collections.abc.Set` [ `str` ]
469 Columns the relation should include.
470 collections : `~collections.abc.Sequence` [ `tuple` \
471 [ `CollectionRecord`, `int` ] ]
472 Collections to search for the dataset and their ranks.
473 context : `SqlQueryContext`
474 Context that manages engines and state for the query.
476 Returns
477 -------
478 relation : `lsst.daf.relation.Relation`
479 New dataset query relation.
480 """
481 payload.where.append(payload.from_clause.columns.dataset_type_id == self._dataset_type_id)
482 dataset_id_col = payload.from_clause.columns.dataset_id
483 collection_col = payload.from_clause.columns[self._collections.getCollectionForeignKeyName()]
484 # We always constrain and optionally retrieve the collection(s) via the
485 # tags/calibs table.
486 if len(collections) == 1:
487 payload.where.append(collection_col == collections[0][0].key)
488 if "collection" in requested_columns:
489 payload.columns_available[DatasetColumnTag(self.datasetType.name, "collection")] = (
490 sqlalchemy.sql.literal(collections[0][0].key)
491 )
492 else:
493 assert collections, "The no-collections case should be handled in calling code for better diagnostics."
494 payload.where.append(collection_col.in_([collection.key for collection, _ in collections]))
495 if "collection" in requested_columns:
496 payload.columns_available[DatasetColumnTag(self.datasetType.name, "collection")] = (
497 collection_col
498 )
499 # Add rank, if requested, as a CASE-based calculation on the
500 # collection column.
501 if "rank" in requested_columns:
502 payload.columns_available[DatasetColumnTag(self.datasetType.name, "rank")] = sqlalchemy.sql.case(
503 {record.key: rank for record, rank in collections},
504 value=collection_col,
505 )
506 # Add more column definitions, starting with the data ID.
507 for dimension_name in self.datasetType.dimensions.required.names:
508 payload.columns_available[DimensionKeyColumnTag(dimension_name)] = payload.from_clause.columns[
509 dimension_name
510 ]
511 # We can always get the dataset_id from the tags/calibs table.
512 if "dataset_id" in requested_columns:
513 payload.columns_available[DatasetColumnTag(self.datasetType.name, "dataset_id")] = dataset_id_col
514 # It's possible we now have everything we need, from just the
515 # tags/calibs table. The things we might need to get from the static
516 # dataset table are the run key and the ingest date.
517 need_static_table = False
518 if "run" in requested_columns:
519 if len(collections) == 1 and collections[0][0].type is CollectionType.RUN:
520 # If we are searching exactly one RUN collection, we
521 # know that if we find the dataset in that collection,
522 # then that's the dataset's run; we don't need to
523 # query for it.
524 payload.columns_available[DatasetColumnTag(self.datasetType.name, "run")] = (
525 sqlalchemy.sql.literal(collections[0][0].key)
526 )
527 else:
528 payload.columns_available[DatasetColumnTag(self.datasetType.name, "run")] = (
529 self._static.dataset.columns[self._runKeyColumn]
530 )
531 need_static_table = True
532 # Ingest date can only come from the static table.
533 if "ingest_date" in requested_columns:
534 need_static_table = True
535 payload.columns_available[DatasetColumnTag(self.datasetType.name, "ingest_date")] = (
536 self._static.dataset.columns.ingest_date
537 )
538 # If we need the static table, join it in via dataset_id and
539 # dataset_type_id
540 if need_static_table:
541 payload.from_clause = payload.from_clause.join(
542 self._static.dataset, onclause=(dataset_id_col == self._static.dataset.columns.id)
543 )
544 # Also constrain dataset_type_id in static table in case that helps
545 # generate a better plan.
546 # We could also include this in the JOIN ON clause, but my guess is
547 # that that's a good idea IFF it's in the foreign key, and right
548 # now it isn't.
549 payload.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id)
550 leaf = context.sql_engine.make_leaf(
551 payload.columns_available.keys(),
552 payload=payload,
553 name=self.datasetType.name,
554 parameters={record.name: rank for record, rank in collections},
555 )
556 return leaf
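The rank column above is computed with a CASE expression keyed on the collection column, which avoids another join. A standalone SQLAlchemy sketch of that construct, with invented collection keys 11 and 12 mapping to ranks 0 and 1:

    import sqlalchemy

    metadata = sqlalchemy.MetaData()
    # Demo table; only the collection column matters for this sketch.
    tags = sqlalchemy.Table("demo_tags", metadata, sqlalchemy.Column("collection_id", sqlalchemy.Integer))
    # case({key: value, ...}, value=column) compares the column against each key.
    rank = sqlalchemy.sql.case({11: 0, 12: 1}, value=tags.c.collection_id)
    stmt = sqlalchemy.select(tags.c.collection_id, rank.label("rank"))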
558 def make_query_joiner(self, collections: Sequence[CollectionRecord], fields: Set[str]) -> QueryJoiner:
559 # This method largely mimics `make_relation`, but it uses the new query
560 # system's primitives instead of the old ones. In terms of the SQL
561 # queries it builds, there are two more main differences:
562 #
563 # - Collection and run columns are now string names rather than IDs.
564 # This insulates the query result-processing code from collection
565 # caching and the collection manager subclass details.
566 #
567 # - The subquery always has unique rows, which is achieved by using
568 # SELECT DISTINCT when necessary.
569 #
570 collection_types = {collection.type for collection in collections}
571 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened."
572 #
573 # There are two kinds of table in play here:
574 #
575 # - the static dataset table (with the dataset ID, dataset type ID,
576 # run ID/name, and ingest date);
577 #
578 # - the dynamic tags/calibs table (with the dataset ID, dataset
579 # type ID, collection ID/name, data ID, and possibly validity
580 # range).
581 #
582 # That means that we might want to return a query against either table
583 # or a JOIN of both, depending on which quantities the caller wants.
584 # But the data ID is always included, which means we'll always include
585 # the tags/calibs table and join in the static dataset table only if we
586 # need things from it that we can't get from the tags/calibs table.
587 #
588 # Note that it's important that we include a WHERE constraint on both
589 # tables for any column (e.g. dataset_type_id) that is in both when
590 # it's given explicitly; not doing so can prevent the query planner from
591 # using very important indexes. At present, we don't include those
592 # redundant columns in the JOIN ON expression, however, because the
593 # FOREIGN KEY (and its index) are defined only on dataset_id.
594 columns = qt.ColumnSet(self.datasetType.dimensions.as_group())
595 columns.drop_implied_dimension_keys()
596 columns.dataset_fields[self.datasetType.name].update(fields)
597 tags_builder: QueryBuilder | None = None
598 if collection_types != {CollectionType.CALIBRATION}: 598 ↛ 614 (the condition on line 598 was never false)
599 # We'll need a subquery for the tags table if any of the given
600 # collections are not a CALIBRATION collection. This intentionally
601 # also fires when the list of collections is empty as a way to
602 # create a dummy subquery that we know will fail.
603 # We give the table an alias because it might appear multiple times
604 # in the same query, for different dataset types.
605 tags_builder = self._finish_query_builder(
606 QueryJoiner(self._db, self._tags.alias(f"{self.datasetType.name}_tags")).to_builder(columns),
607 [record for record in collections if record.type is not CollectionType.CALIBRATION],
608 fields,
609 )
610 if "timespan" in fields: 610 ↛ 611line 610 didn't jump to line 611
611 tags_builder.joiner.timespans[self.datasetType.name] = (
612 self._db.getTimespanRepresentation().fromLiteral(Timespan(None, None))
613 )
614 calibs_builder: QueryBuilder | None = None
615 if CollectionType.CALIBRATION in collection_types: 615 ↛ 619 (the condition on line 615 was never true)
616 # If at least one collection is a CALIBRATION collection, we'll
617 # need a subquery for the calibs table, and could include the
618 # timespan as a result or constraint.
619 assert (
620 self._calibs is not None
621 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
622 calibs_builder = self._finish_query_builder(
623 QueryJoiner(self._db, self._calibs.alias(f"{self.datasetType.name}_calibs")).to_builder(
624 columns
625 ),
626 [record for record in collections if record.type is CollectionType.CALIBRATION],
627 fields,
628 )
629 if "timespan" in fields:
630 calibs_builder.joiner.timespans[self.datasetType.name] = (
631 self._db.getTimespanRepresentation().from_columns(self._calibs.columns)
632 )
634 # In calibration collections, we need timespan as well as data ID
635 # to ensure unique rows.
636 calibs_builder.distinct = calibs_builder.distinct and "timespan" not in fields
637 if tags_builder is not None: 637 ↛ 643 (the condition on line 637 was never false)
638 if calibs_builder is not None: 638 ↛ 640 (the condition on line 638 was never true)
639 # Need a UNION subquery.
640 return tags_builder.union_subquery([calibs_builder])
641 else:
642 return tags_builder.to_joiner()
643 elif calibs_builder is not None:
644 return calibs_builder.to_joiner()
645 else:
646 raise AssertionError("Branch should be unreachable.")
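One difference called out above is that the new-system subquery guarantees unique rows, applying SELECT DISTINCT only when it is actually needed (several collections searched and no extra fields requested). A minimal SQLAlchemy sketch of that conditional DISTINCT (the table and the predicate are illustrative):

    import sqlalchemy

    metadata = sqlalchemy.MetaData()
    # Demo table standing in for the tags table.
    tags = sqlalchemy.Table("demo_tags", metadata,
                            sqlalchemy.Column("dataset_id", sqlalchemy.Integer),
                            sqlalchemy.Column("collection_id", sqlalchemy.Integer))
    stmt = sqlalchemy.select(tags.c.dataset_id).where(tags.c.collection_id.in_([11, 12]))
    searching_multiple_collections = True
    if searching_multiple_collections:
        # The same dataset can be tagged in more than one collection, so
        # deduplicate when more than one collection is searched.
        stmt = stmt.distinct()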
648 def _finish_query_builder(
649 self,
650 sql_projection: QueryBuilder,
651 collections: Sequence[CollectionRecord],
652 fields: Set[str],
653 ) -> QueryBuilder:
654 # This method plays the same role as _finish_single_relation in the new
655 # query system. It is called exactly one or two times by
656 # make_query_joiner, just as _finish_single_relation is called exactly
657 # one or two times by make_relation. See make_query_joiner comments for
658 # what's different.
659 assert sql_projection.joiner.from_clause is not None
660 run_collections_only = all(record.type is CollectionType.RUN for record in collections)
661 sql_projection.joiner.where(
662 sql_projection.joiner.from_clause.c.dataset_type_id == self._dataset_type_id
663 )
664 dataset_id_col = sql_projection.joiner.from_clause.c.dataset_id
665 collection_col = sql_projection.joiner.from_clause.c[self._collections.getCollectionForeignKeyName()]
666 fields_provided = sql_projection.joiner.fields[self.datasetType.name]
667 # We always constrain and optionally retrieve the collection(s) via the
668 # tags/calibs table.
669 if "collection_key" in fields: 669 ↛ 670line 669 didn't jump to line 670, because the condition on line 669 was never true
670 sql_projection.joiner.fields[self.datasetType.name]["collection_key"] = collection_col
671 if len(collections) == 1:
672 only_collection_record = collections[0]
673 sql_projection.joiner.where(collection_col == only_collection_record.key)
674 if "collection" in fields: 674 ↛ 675line 674 didn't jump to line 675, because the condition on line 674 was never true
675 fields_provided["collection"] = sqlalchemy.literal(only_collection_record.name)
676 elif not collections:
677 sql_projection.joiner.where(sqlalchemy.literal(False))
678 if "collection" in fields: 678 ↛ 679line 678 didn't jump to line 679, because the condition on line 678 was never true
679 fields_provided["collection"] = sqlalchemy.literal("NO COLLECTIONS")
680 else:
681 sql_projection.joiner.where(collection_col.in_([collection.key for collection in collections]))
682 if "collection" in fields:
683 # Avoid a join to the collection table to get the name by using
684 # a CASE statement. The SQL will be a bit more verbose but
685 # more efficient.
686 fields_provided["collection"] = sqlalchemy.case(
687 {record.key: record.name for record in collections}, value=collection_col
688 )
689 # Add more column definitions, starting with the data ID.
690 sql_projection.joiner.extract_dimensions(self.datasetType.dimensions.required.names)
691 # We can always get the dataset_id from the tags/calibs table, even if
692 # we could also get it from the 'static' dataset table.
693 if "dataset_id" in fields: 693 ↛ 694line 693 didn't jump to line 694, because the condition on line 693 was never true
694 fields_provided["dataset_id"] = dataset_id_col
696 # It's possible we now have everything we need, from just the
697 # tags/calibs table. The things we might need to get from the static
698 # dataset table are the run key and the ingest date.
699 need_static_table = False
700 if "run" in fields: 700 ↛ 701line 700 didn't jump to line 701, because the condition on line 700 was never true
701 if len(collections) == 1 and run_collections_only:
702 # If we are searching exactly one RUN collection, we
703 # know that if we find the dataset in that collection,
704 # then that's the dataset's run; we don't need to
705 # query for it.
706 fields_provided["run"] = sqlalchemy.literal(only_collection_record.name)
707 elif run_collections_only:
708 # Once again we can avoid joining to the collection table by
709 # adding a CASE statement.
710 fields_provided["run"] = sqlalchemy.case(
711 {record.key: record.name for record in collections},
712 value=self._static.dataset.c[self._runKeyColumn],
713 )
714 need_static_table = True
715 else:
716 # Here we can't avoid a join to the collection table, because
717 # we might find a dataset via something other than its RUN
718 # collection.
719 (
720 fields_provided["run"],
721 sql_projection.joiner.from_clause,
722 ) = self._collections.lookup_name_sql(
723 self._static.dataset.c[self._runKeyColumn],
724 sql_projection.joiner.from_clause,
725 )
726 need_static_table = True
727 # Ingest date can only come from the static table.
728 if "ingest_date" in fields: 728 ↛ 729line 728 didn't jump to line 729, because the condition on line 728 was never true
729 fields_provided["ingest_date"] = self._static.dataset.c.ingest_date
730 need_static_table = True
731 if need_static_table: 731 ↛ 735 (the condition on line 731 was never true)
732 # If we need the static table, join it in via dataset_id. We don't
733 # use QueryJoiner.join because we're joining on dataset ID, not
734 # dimensions.
735 sql_projection.joiner.from_clause = sql_projection.joiner.from_clause.join(
736 self._static.dataset, onclause=(dataset_id_col == self._static.dataset.c.id)
737 )
738 # Also constrain dataset_type_id in static table in case that helps
739 # generate a better plan. We could also include this in the JOIN ON
740 # clause, but my guess is that that's a good idea IFF it's in the
741 # foreign key, and right now it isn't.
742 sql_projection.joiner.where(self._static.dataset.c.dataset_type_id == self._dataset_type_id)
743 sql_projection.distinct = (
744 # If there are multiple collections, this subquery might have
745 # non-unique rows.
746 len(collections) > 1
747 and not fields
748 )
749 return sql_projection
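When no collections are given at all, the builder above still produces a valid query by attaching a constant-false WHERE clause rather than an empty IN list. A short SQLAlchemy sketch of that trick (table and column invented):

    import sqlalchemy

    metadata = sqlalchemy.MetaData()
    # Demo table; any table works for this illustration.
    tags = sqlalchemy.Table("demo_tags2", metadata, sqlalchemy.Column("dataset_id", sqlalchemy.Integer))
    # sqlalchemy.literal(False) renders as a false constant, so the query is
    # well-formed SQL that simply returns no rows.
    stmt = sqlalchemy.select(tags.c.dataset_id).where(sqlalchemy.literal(False))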
751 def getDataId(self, id: DatasetId) -> DataCoordinate:
752 """Return DataId for a dataset.
754 Parameters
755 ----------
756 id : `DatasetId`
757 Unique dataset identifier.
759 Returns
760 -------
761 dataId : `DataCoordinate`
762 DataId for the dataset.
763 """
764 # This query could return multiple rows (one for each tagged collection
765 # the dataset is in, plus one for its run collection), and we don't
766 # care which of those we get.
767 sql = (
768 self._tags.select()
769 .where(
770 sqlalchemy.sql.and_(
771 self._tags.columns.dataset_id == id,
772 self._tags.columns.dataset_type_id == self._dataset_type_id,
773 )
774 )
775 .limit(1)
776 )
777 with self._db.query(sql) as sql_result:
778 row = sql_result.mappings().fetchone()
779 assert row is not None, "Should be guaranteed by caller and foreign key constraints."
780 return DataCoordinate.from_required_values(
781 self.datasetType.dimensions.as_group(),
782 tuple(row[dimension] for dimension in self.datasetType.dimensions.required.names),
783 )
786class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
787 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for
788 dataset IDs.
789 """
791 idMaker = DatasetIdFactory()
792 """Factory for dataset IDs. In the future this factory may be shared with
793 other classes (e.g. Registry)."""
795 def insert(
796 self,
797 run: RunRecord,
798 dataIds: Iterable[DataCoordinate],
799 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
800 ) -> Iterator[DatasetRef]:
801 # Docstring inherited from DatasetRecordStorage.
803 # Current timestamp, type depends on schema version. Use microsecond
804 # precision for astropy time to keep things consistent with
805 # TIMESTAMP(6) SQL type.
806 timestamp: datetime.datetime | astropy.time.Time
807 if self._use_astropy:
808 # Astropy `now()` precision should be the same as `datetime.now()`, which
809 # should mean microsecond.
810 timestamp = astropy.time.Time.now()
811 else:
812 timestamp = datetime.datetime.now(datetime.UTC)
814 # Iterate over data IDs, transforming a possibly-single-pass iterable
815 # into a list.
816 dataIdList: list[DataCoordinate] = []
817 rows = []
818 summary = CollectionSummary()
819 for dataId in summary.add_data_ids_generator(self.datasetType, dataIds):
820 dataIdList.append(dataId)
821 rows.append(
822 {
823 "id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode),
824 "dataset_type_id": self._dataset_type_id,
825 self._runKeyColumn: run.key,
826 "ingest_date": timestamp,
827 }
828 )
830 with self._db.transaction():
831 # Insert into the static dataset table.
832 self._db.insert(self._static.dataset, *rows)
833 # Update the summary tables for this collection in case this is the
834 # first time this dataset type or these governor values will be
835 # inserted there.
836 self._summaries.update(run, [self._dataset_type_id], summary)
837 # Combine the generated dataset_id values and data ID fields to
838 # form rows to be inserted into the tags table.
839 protoTagsRow = {
840 "dataset_type_id": self._dataset_type_id,
841 self._collections.getCollectionForeignKeyName(): run.key,
842 }
843 tagsRows = [
844 dict(protoTagsRow, dataset_id=row["id"], **dataId.required)
845 for dataId, row in zip(dataIdList, rows, strict=True)
846 ]
847 # Insert those rows into the tags table.
848 self._db.insert(self._tags, *tagsRows)
850 for dataId, row in zip(dataIdList, rows, strict=True):
851 yield DatasetRef(
852 datasetType=self.datasetType,
853 dataId=dataId,
854 id=row["id"],
855 run=run.name,
856 )
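The ingest timestamp above is deliberately limited to microsecond precision so it round-trips through a TIMESTAMP(6) column, whichever branch is taken. A quick standalone check of the two forms of "now" (not butler code), assuming astropy is installed and Python 3.11+ for `datetime.UTC`:

    import datetime

    import astropy.time

    dt_now = datetime.datetime.now(datetime.UTC)  # native microsecond resolution
    ap_now = astropy.time.Time.now()              # astropy Time in UTC scale
    # An astropy Time converts to a datetime, i.e. to microsecond resolution.
    assert isinstance(ap_now.to_datetime(), datetime.datetime)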
858 def import_(
859 self,
860 run: RunRecord,
861 datasets: Iterable[DatasetRef],
862 ) -> Iterator[DatasetRef]:
863 # Docstring inherited from DatasetRecordStorage.
865 # Current timestamp, type depends on schema version.
866 if self._use_astropy:
867 # Astropy `now()` precision should be the same as `datetime.now()`, which
868 # should mean microsecond.
869 timestamp = sqlalchemy.sql.literal(astropy.time.Time.now(), type_=ddl.AstropyTimeNsecTai)
870 else:
871 timestamp = sqlalchemy.sql.literal(datetime.datetime.now(datetime.UTC))
873 # Iterate over data IDs, transforming a possibly-single-pass iterable
874 # into a list.
875 dataIds: dict[DatasetId, DataCoordinate] = {}
876 summary = CollectionSummary()
877 for dataset in summary.add_datasets_generator(datasets):
878 dataIds[dataset.id] = dataset.dataId
880 # We'll insert all new rows into a temporary table
881 tableSpec = makeTagTableSpec(self.datasetType, type(self._collections), ddl.GUID, constraints=False)
882 collFkName = self._collections.getCollectionForeignKeyName()
883 protoTagsRow = {
884 "dataset_type_id": self._dataset_type_id,
885 collFkName: run.key,
886 }
887 tmpRows = [
888 dict(protoTagsRow, dataset_id=dataset_id, **dataId.required)
889 for dataset_id, dataId in dataIds.items()
890 ]
891 with self._db.transaction(for_temp_tables=True), self._db.temporary_table(tableSpec) as tmp_tags:
892 # store all incoming data in a temporary table
893 self._db.insert(tmp_tags, *tmpRows)
895 # There are some checks that we want to make for consistency
896 # of the new datasets with existing ones.
897 self._validateImport(tmp_tags, run)
899 # Before we merge the temporary table into dataset/tags we need to
900 # drop datasets that are already there (and do not conflict).
901 self._db.deleteWhere(
902 tmp_tags,
903 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)),
904 )
906 # Copy it into the dataset table; we need to re-label some columns.
907 self._db.insert(
908 self._static.dataset,
909 select=sqlalchemy.sql.select(
910 tmp_tags.columns.dataset_id.label("id"),
911 tmp_tags.columns.dataset_type_id,
912 tmp_tags.columns[collFkName].label(self._runKeyColumn),
913 timestamp.label("ingest_date"),
914 ),
915 )
917 # Update the summary tables for this collection in case this
918 # is the first time this dataset type or these governor values
919 # will be inserted there.
920 self._summaries.update(run, [self._dataset_type_id], summary)
922 # Copy it into tags table.
923 self._db.insert(self._tags, select=tmp_tags.select())
925 # Return refs in the same order as in the input list.
926 for dataset_id, dataId in dataIds.items():
927 yield DatasetRef(
928 datasetType=self.datasetType,
929 id=dataset_id,
930 dataId=dataId,
931 run=run.name,
932 )
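`import_` stages everything in a temporary table, validates it, deletes the staged rows whose dataset IDs already exist, and then copies the remainder with INSERT ... FROM SELECT. The overall shape of that merge in plain SQLAlchemy against in-memory SQLite (the tables and columns are simplified stand-ins, not the real schema):

    import sqlalchemy

    engine = sqlalchemy.create_engine("sqlite://")
    metadata = sqlalchemy.MetaData()
    # Simplified stand-ins for the static dataset table and the temporary tags table.
    dataset = sqlalchemy.Table("dataset", metadata, sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True))
    staging = sqlalchemy.Table("staging", metadata, sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True))
    metadata.create_all(engine)

    with engine.begin() as conn:
        conn.execute(dataset.insert(), [{"id": 1}])             # pre-existing dataset
        conn.execute(staging.insert(), [{"id": 1}, {"id": 2}])  # incoming rows
        # Drop staged rows whose IDs already exist (they were validated earlier) ...
        conn.execute(staging.delete().where(staging.c.id.in_(sqlalchemy.select(dataset.c.id))))
        # ... and copy the rest into the real table in a single statement.
        conn.execute(dataset.insert().from_select(["id"], sqlalchemy.select(staging.c.id)))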
934 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None:
935 """Validate imported refs against existing datasets.
937 Parameters
938 ----------
939 tmp_tags : `sqlalchemy.schema.Table`
940 Temporary table with new datasets and the same schema as tags
941 table.
942 run : `RunRecord`
943 The record object describing the `~CollectionType.RUN` collection.
945 Raises
946 ------
947 ConflictingDefinitionError
948 Raised if new datasets conflict with existing ones.
949 """
950 dataset = self._static.dataset
951 tags = self._tags
952 collFkName = self._collections.getCollectionForeignKeyName()
954 # Check that existing datasets have the same dataset type and
955 # run.
956 query = (
957 sqlalchemy.sql.select(
958 dataset.columns.id.label("dataset_id"),
959 dataset.columns.dataset_type_id.label("dataset_type_id"),
960 tmp_tags.columns.dataset_type_id.label("new_dataset_type_id"),
961 dataset.columns[self._runKeyColumn].label("run"),
962 tmp_tags.columns[collFkName].label("new_run"),
963 )
964 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id))
965 .where(
966 sqlalchemy.sql.or_(
967 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
968 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName],
969 )
970 )
971 .limit(1)
972 )
973 with self._db.query(query) as result:
974 # Only include the first one in the exception message
975 if (row := result.first()) is not None:
976 existing_run = self._collections[row.run].name
977 new_run = self._collections[row.new_run].name
978 if row.dataset_type_id == self._dataset_type_id:
979 if row.new_dataset_type_id == self._dataset_type_id: 979 ↛ 985 (the condition on line 979 was never false)
980 raise ConflictingDefinitionError(
981 f"Current run {existing_run!r} and new run {new_run!r} do not agree for "
982 f"dataset {row.dataset_id}."
983 )
984 else:
985 raise ConflictingDefinitionError(
986 f"Dataset {row.dataset_id} was provided with type {self.datasetType.name!r} "
987 f"in run {new_run!r}, but was already defined with type ID {row.dataset_type_id} "
988 f"in run {run!r}."
989 )
990 else:
991 raise ConflictingDefinitionError(
992 f"Dataset {row.dataset_id} was provided with type ID {row.new_dataset_type_id} "
993 f"in run {new_run!r}, but was already defined with type {self.datasetType.name!r} "
994 f"in run {run!r}."
995 )
997 # Check that matching dataset in tags table has the same DataId.
998 query = (
999 sqlalchemy.sql.select(
1000 tags.columns.dataset_id,
1001 tags.columns.dataset_type_id.label("type_id"),
1002 tmp_tags.columns.dataset_type_id.label("new_type_id"),
1003 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
1004 *[
1005 tmp_tags.columns[dim].label(f"new_{dim}")
1006 for dim in self.datasetType.dimensions.required.names
1007 ],
1008 )
1009 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id))
1010 .where(
1011 sqlalchemy.sql.or_(
1012 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
1013 *[
1014 tags.columns[dim] != tmp_tags.columns[dim]
1015 for dim in self.datasetType.dimensions.required.names
1016 ],
1017 )
1018 )
1019 .limit(1)
1020 )
1022 with self._db.query(query) as result:
1023 if (row := result.first()) is not None:
1024 # Only include the first one in the exception message
1025 raise ConflictingDefinitionError(
1026 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}"
1027 )
1029 # Check that matching run+dataId have the same dataset ID.
1030 query = (
1031 sqlalchemy.sql.select(
1032 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
1033 tags.columns.dataset_id,
1034 tmp_tags.columns.dataset_id.label("new_dataset_id"),
1035 tags.columns[collFkName],
1036 tmp_tags.columns[collFkName].label(f"new_{collFkName}"),
1037 )
1038 .select_from(
1039 tags.join(
1040 tmp_tags,
1041 sqlalchemy.sql.and_(
1042 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id,
1043 tags.columns[collFkName] == tmp_tags.columns[collFkName],
1044 *[
1045 tags.columns[dim] == tmp_tags.columns[dim]
1046 for dim in self.datasetType.dimensions.required.names
1047 ],
1048 ),
1049 )
1050 )
1051 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id)
1052 .limit(1)
1053 )
1054 with self._db.query(query) as result:
1055 # only include the first one in the exception message
1056 if (row := result.first()) is not None:
1057 data_id = {dim: getattr(row, dim) for dim in self.datasetType.dimensions.required.names}
1058 existing_collection = self._collections[getattr(row, collFkName)].name
1059 new_collection = self._collections[getattr(row, f"new_{collFkName}")].name
1060 raise ConflictingDefinitionError(
1061 f"Dataset with type {self.datasetType.name!r} and data ID {data_id} "
1062 f"has ID {row.dataset_id} in existing collection {existing_collection!r} "
1063 f"but ID {row.new_dataset_id} in new collection {new_collection!r}."
1064 )
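Each of the three validation queries above has the same shape: join the staged rows to the existing rows on a shared key, keep only rows where some column disagrees, and fetch at most one row to report in the error message. A generic SQLAlchemy sketch of that shape (the tables and the `run` column are illustrative):

    import sqlalchemy

    metadata = sqlalchemy.MetaData()
    # Illustrative tables standing in for the existing and staged rows.
    existing = sqlalchemy.Table("existing", metadata,
                                sqlalchemy.Column("dataset_id", sqlalchemy.Integer),
                                sqlalchemy.Column("run", sqlalchemy.Integer))
    staged = sqlalchemy.Table("staged", metadata,
                              sqlalchemy.Column("dataset_id", sqlalchemy.Integer),
                              sqlalchemy.Column("run", sqlalchemy.Integer))
    conflicts = (
        sqlalchemy.select(existing.c.dataset_id, existing.c.run, staged.c.run.label("new_run"))
        .select_from(existing.join(staged, existing.c.dataset_id == staged.c.dataset_id))
        .where(existing.c.run != staged.c.run)  # any disagreement is a conflict
        .limit(1)                               # one example is enough for the message
    )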