Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 95%
261 statements
coverage.py v7.4.4, created at 2024-04-05 09:58 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
29from __future__ import annotations
31from .... import ddl
33__all__ = ("ByDimensionsDatasetRecordStorage",)
35import datetime
36from collections.abc import Callable, Iterable, Iterator, Sequence, Set
37from typing import TYPE_CHECKING
39import astropy.time
40import sqlalchemy
41from lsst.daf.relation import Relation, sql
43from ...._column_tags import DatasetColumnTag, DimensionKeyColumnTag
44from ...._column_type_info import LogicalColumn
45from ...._dataset_ref import DatasetId, DatasetIdFactory, DatasetIdGenEnum, DatasetRef
46from ...._dataset_type import DatasetType
47from ...._exceptions import CollectionTypeError
48from ...._timespan import Timespan
49from ....dimensions import DataCoordinate
50from ..._collection_summary import CollectionSummary
51from ..._collection_type import CollectionType
52from ..._exceptions import ConflictingDefinitionError
53from ...interfaces import DatasetRecordStorage
54from ...queries import SqlQueryContext
55from .tables import makeTagTableSpec
57if TYPE_CHECKING:
58 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
59 from .summaries import CollectionSummaryManager
60 from .tables import StaticDatasetTablesTuple
63class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
64 """Dataset record storage implementation paired with
65 `ByDimensionsDatasetRecordStorageManagerUUID`; see that class for more
66 information.
68 Instances of this class should never be constructed directly; use
69 `DatasetRecordStorageManager.register` instead.
71 Parameters
72 ----------
73 datasetType : `DatasetType`
74 The dataset type to use.
75 db : `Database`
76 Database connection.
77 dataset_type_id : `int`
78 Dataset type identifier.
79 collections : `CollectionManager`
80 The collection manager.
81 static : `StaticDatasetTablesTuple`
82 Named tuple of the static dataset tables.
83 summaries : `CollectionSummaryManager`
84 Collection summary manager.
85 tags_table_factory : `~collections.abc.Callable`
86 Factory for creating tags tables.
87 use_astropy_ingest_date : `bool`
88 Whether to use Astropy for ingest date.
89 calibs_table_factory : `~collections.abc.Callable`
90 Factory for creating calibration tables.
91 """
93 def __init__(
94 self,
95 *,
96 datasetType: DatasetType,
97 db: Database,
98 dataset_type_id: int,
99 collections: CollectionManager,
100 static: StaticDatasetTablesTuple,
101 summaries: CollectionSummaryManager,
102 tags_table_factory: Callable[[], sqlalchemy.schema.Table],
103 use_astropy_ingest_date: bool,
104 calibs_table_factory: Callable[[], sqlalchemy.schema.Table] | None,
105 ):
106 super().__init__(datasetType=datasetType)
107 self._dataset_type_id = dataset_type_id
108 self._db = db
109 self._collections = collections
110 self._static = static
111 self._summaries = summaries
112 self._tags_table_factory = tags_table_factory
113 self._calibs_table_factory = calibs_table_factory
114 self._runKeyColumn = collections.getRunForeignKeyName()
115 self._use_astropy = use_astropy_ingest_date
116 self._tags_table: sqlalchemy.schema.Table | None = None
117 self._calibs_table: sqlalchemy.schema.Table | None = None
119 @property
120 def _tags(self) -> sqlalchemy.schema.Table:
121 if self._tags_table is None:
122 self._tags_table = self._tags_table_factory()
123 return self._tags_table
125 @property
126 def _calibs(self) -> sqlalchemy.schema.Table | None:
127 if self._calibs_table is None:
128 if self._calibs_table_factory is None:  # 128 ↛ 129: condition on line 128 was never true
129 return None
130 self._calibs_table = self._calibs_table_factory()
131 return self._calibs_table
133 def delete(self, datasets: Iterable[DatasetRef]) -> None:
134 # Docstring inherited from DatasetRecordStorage.
135 # Only delete from common dataset table; ON DELETE foreign key clauses
136 # will handle the rest.
137 self._db.delete(
138 self._static.dataset,
139 ["id"],
140 *[{"id": dataset.id} for dataset in datasets],
141 )
143 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
144 # Docstring inherited from DatasetRecordStorage.
145 if collection.type is not CollectionType.TAGGED:  # 145 ↛ 146: condition on line 145 was never true
146 raise TypeError(
147 f"Cannot associate into collection '{collection.name}' "
148 f"of type {collection.type.name}; must be TAGGED."
149 )
150 protoRow = {
151 self._collections.getCollectionForeignKeyName(): collection.key,
152 "dataset_type_id": self._dataset_type_id,
153 }
154 rows = []
155 summary = CollectionSummary()
156 for dataset in summary.add_datasets_generator(datasets):
157 rows.append(dict(protoRow, dataset_id=dataset.id, **dataset.dataId.required))
158 # Update the summary tables for this collection in case this is the
159 # first time this dataset type or these governor values will be
160 # inserted there.
161 self._summaries.update(collection, [self._dataset_type_id], summary)
162 # Update the tag table itself.
163 self._db.replace(self._tags, *rows)
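# Illustrative tags-table row (added; values are hypothetical) for a dataset
# type whose required dimensions are {"instrument", "detector"}:
#
#     {<collection FK column>: 5, "dataset_type_id": 12,
#      "dataset_id": UUID("..."), "instrument": "HSC", "detector": 101}
#
# The keys come from protoRow plus the dataset ID and required data ID values,
# matching the rows built in the loop above.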
165 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
166 # Docstring inherited from DatasetRecordStorage.
167 if collection.type is not CollectionType.TAGGED:  # 167 ↛ 168: condition on line 167 was never true
168 raise TypeError(
169 f"Cannot disassociate from collection '{collection.name}' "
170 f"of type {collection.type.name}; must be TAGGED."
171 )
172 rows = [
173 {
174 "dataset_id": dataset.id,
175 self._collections.getCollectionForeignKeyName(): collection.key,
176 }
177 for dataset in datasets
178 ]
179 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows)
181 def _buildCalibOverlapQuery(
182 self,
183 collection: CollectionRecord,
184 data_ids: set[DataCoordinate] | None,
185 timespan: Timespan,
186 context: SqlQueryContext,
187 ) -> Relation:
188 relation = self.make_relation(
189 collection, columns={"timespan", "dataset_id", "calib_pkey"}, context=context
190 ).with_rows_satisfying(
191 context.make_timespan_overlap_predicate(
192 DatasetColumnTag(self.datasetType.name, "timespan"), timespan
193 ),
194 )
195 if data_ids is not None:
196 relation = relation.join(
197 context.make_data_id_relation(
198 data_ids, self.datasetType.dimensions.required.names
199 ).transferred_to(context.sql_engine),
200 )
201 return relation
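# Added summary (hedged): the relation built above selects rows of this dataset
# type in the given collection whose validity range overlaps `timespan`,
# optionally restricted to the given data IDs; certify() and decertify() below
# use it to find conflicting or affected calibration rows before writing.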
203 def certify(
204 self,
205 collection: CollectionRecord,
206 datasets: Iterable[DatasetRef],
207 timespan: Timespan,
208 context: SqlQueryContext,
209 ) -> None:
210 # Docstring inherited from DatasetRecordStorage.
211 if self._calibs is None:  # 211 ↛ 212: condition on line 211 was never true
212 raise CollectionTypeError(
213 f"Cannot certify datasets of type {self.datasetType.name}, for which "
214 "DatasetType.isCalibration() is False."
215 )
216 if collection.type is not CollectionType.CALIBRATION:  # 216 ↛ 217: condition on line 216 was never true
217 raise CollectionTypeError(
218 f"Cannot certify into collection '{collection.name}' "
219 f"of type {collection.type.name}; must be CALIBRATION."
220 )
221 TimespanReprClass = self._db.getTimespanRepresentation()
222 protoRow = {
223 self._collections.getCollectionForeignKeyName(): collection.key,
224 "dataset_type_id": self._dataset_type_id,
225 }
226 rows = []
227 dataIds: set[DataCoordinate] | None = (
228 set() if not TimespanReprClass.hasExclusionConstraint() else None
229 )
230 summary = CollectionSummary()
231 for dataset in summary.add_datasets_generator(datasets):
232 row = dict(protoRow, dataset_id=dataset.id, **dataset.dataId.required)
233 TimespanReprClass.update(timespan, result=row)
234 rows.append(row)
235 if dataIds is not None:  # 235 ↛ 231: condition on line 235 was never false
236 dataIds.add(dataset.dataId)
237 # Update the summary tables for this collection in case this is the
238 # first time this dataset type or these governor values will be
239 # inserted there.
240 self._summaries.update(collection, [self._dataset_type_id], summary)
241 # Update the association table itself.
242 if TimespanReprClass.hasExclusionConstraint():  # 242 ↛ 245: condition on line 242 was never true
243 # Rely on database constraint to enforce invariants; we just
244 # reraise the exception for consistency across DB engines.
245 try:
246 self._db.insert(self._calibs, *rows)
247 except sqlalchemy.exc.IntegrityError as err:
248 raise ConflictingDefinitionError(
249 f"Validity range conflict certifying datasets of type {self.datasetType.name} "
250 f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
251 ) from err
252 else:
253 # Have to implement exclusion constraint ourselves.
254 # Start by building a SELECT query for any rows that would overlap
255 # this one.
256 relation = self._buildCalibOverlapQuery(collection, dataIds, timespan, context)
257 # Acquire a table lock to ensure there are no concurrent writes that
258 # could invalidate our checking before we finish the inserts. We
259 # use a SAVEPOINT in case there is an outer transaction that a
260 # failure here should not roll back.
261 with self._db.transaction(lock=[self._calibs], savepoint=True):
262 # Enter SqlQueryContext in case we need to use a temporary
263 # table to include the given data IDs in the query. Note that
264 # by doing this inside the transaction, we make sure it doesn't
265 # attempt to close the session when it's done, since it just
266 # sees an already-open session that it knows it shouldn't
267 # manage.
268 with context:
269 # Run the check SELECT query.
270 conflicting = context.count(context.process(relation))
271 if conflicting > 0:
272 raise ConflictingDefinitionError(
273 f"{conflicting} validity range conflicts certifying datasets of type "
274 f"{self.datasetType.name} into {collection.name} for range "
275 f"[{timespan.begin}, {timespan.end})."
276 )
277 # Proceed with the insert.
278 self._db.insert(self._calibs, *rows)
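# Worked example (added; dates are hypothetical): if a dataset with the same
# data ID is already certified in this collection for [2024-01-01, 2024-03-01)
# and we certify it again for [2024-02-01, 2024-04-01), the overlap query above
# (or the database exclusion constraint, where supported) detects the clash and
# a ConflictingDefinitionError is raised instead of inserting the row.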
280 def decertify(
281 self,
282 collection: CollectionRecord,
283 timespan: Timespan,
284 *,
285 dataIds: Iterable[DataCoordinate] | None = None,
286 context: SqlQueryContext,
287 ) -> None:
288 # Docstring inherited from DatasetRecordStorage.
289 if self._calibs is None:  # 289 ↛ 290: condition on line 289 was never true
290 raise CollectionTypeError(
291 f"Cannot decertify datasets of type {self.datasetType.name}, for which "
292 "DatasetType.isCalibration() is False."
293 )
294 if collection.type is not CollectionType.CALIBRATION:  # 294 ↛ 295: condition on line 294 was never true
295 raise CollectionTypeError(
296 f"Cannot decertify from collection '{collection.name}' "
297 f"of type {collection.type.name}; must be CALIBRATION."
298 )
299 TimespanReprClass = self._db.getTimespanRepresentation()
300 # Construct a SELECT query to find all rows that overlap our inputs.
301 dataIdSet: set[DataCoordinate] | None
302 if dataIds is not None:
303 dataIdSet = set(dataIds)
304 else:
305 dataIdSet = None
306 relation = self._buildCalibOverlapQuery(collection, dataIdSet, timespan, context)
307 calib_pkey_tag = DatasetColumnTag(self.datasetType.name, "calib_pkey")
308 dataset_id_tag = DatasetColumnTag(self.datasetType.name, "dataset_id")
309 timespan_tag = DatasetColumnTag(self.datasetType.name, "timespan")
310 data_id_tags = [
311 (name, DimensionKeyColumnTag(name)) for name in self.datasetType.dimensions.required.names
312 ]
313 # Set up collections to populate with the rows we'll want to modify.
314 # The insert rows will have the same values for collection and
315 # dataset type.
316 protoInsertRow = {
317 self._collections.getCollectionForeignKeyName(): collection.key,
318 "dataset_type_id": self._dataset_type_id,
319 }
320 rowsToDelete = []
321 rowsToInsert = []
322 # Acquire a table lock to ensure there are no concurrent writes
323 # between the SELECT and the DELETE and INSERT queries based on it.
324 with self._db.transaction(lock=[self._calibs], savepoint=True):
325 # Enter SqlQueryContext in case we need to use a temporary table to
326 # include the given data IDs in the query (see the similar block in
327 # certify for details).
328 with context:
329 for row in context.fetch_iterable(relation):
330 rowsToDelete.append({"id": row[calib_pkey_tag]})
331 # Construct the insert row(s) by copying the prototype row,
332 # then adding the dimension column values, then adding
333 # what's left of the timespan from that row after we
334 # subtract the given timespan.
335 newInsertRow = protoInsertRow.copy()
336 newInsertRow["dataset_id"] = row[dataset_id_tag]
337 for name, tag in data_id_tags:
338 newInsertRow[name] = row[tag]
339 rowTimespan = row[timespan_tag]
340 assert rowTimespan is not None, "Field should have a NOT NULL constraint."
341 for diffTimespan in rowTimespan.difference(timespan):
342 rowsToInsert.append(
343 TimespanReprClass.update(diffTimespan, result=newInsertRow.copy())
344 )
345 # Run the DELETE and INSERT queries.
346 self._db.delete(self._calibs, ["id"], *rowsToDelete)
347 self._db.insert(self._calibs, *rowsToInsert)
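# Worked example (added; dates are hypothetical): decertifying
# [2024-02-01, 2024-03-01) from a row whose validity range is
# [2024-01-01, 2024-06-01) deletes that row and inserts two replacements
# covering [2024-01-01, 2024-02-01) and [2024-03-01, 2024-06-01); if the ranges
# only partially overlap, Timespan.difference yields a single remaining piece.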
349 def make_relation(
350 self,
351 *collections: CollectionRecord,
352 columns: Set[str],
353 context: SqlQueryContext,
354 ) -> Relation:
355 # Docstring inherited from DatasetRecordStorage.
356 collection_types = {collection.type for collection in collections}
357 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened."
358 TimespanReprClass = self._db.getTimespanRepresentation()
359 #
360 # There are two kinds of table in play here:
361 #
362 # - the static dataset table (with the dataset ID, dataset type ID,
363 # run ID/name, and ingest date);
364 #
365 # - the dynamic tags/calibs table (with the dataset ID, dataset
366 #   type ID, collection ID/name, data ID, and possibly validity
367 # range).
368 #
369 # That means that we might want to return a query against either table
370 # or a JOIN of both, depending on which quantities the caller wants.
371 # But the data ID is always included, which means we'll always include
372 # the tags/calibs table and join in the static dataset table only if we
373 # need things from it that we can't get from the tags/calibs table.
374 #
375 # Note that it's important that we include a WHERE constraint on both
376 # tables for any column (e.g. dataset_type_id) that is in both when
377 # it's given explicitly; not doing so can prevent the query planner from
378 # using very important indexes. At present, we don't include those
379 # redundant columns in the JOIN ON expression, however, because the
380 # FOREIGN KEY (and its index) are defined only on dataset_id.
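# Schematic illustration (added; simplified, hypothetical SQL) of the two query
# shapes this method can produce for a dataset type named "flat":
#
#     SELECT ... FROM flat_tags WHERE flat_tags.dataset_type_id = :id AND ...
#
#     SELECT ... FROM flat_tags JOIN dataset ON flat_tags.dataset_id = dataset.id
#     WHERE flat_tags.dataset_type_id = :id AND dataset.dataset_type_id = :id ...
#
# The JOIN form is only needed when the run or ingest_date columns are
# requested; see _finish_single_relation below.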
381 tag_relation: Relation | None = None
382 calib_relation: Relation | None = None
383 if collection_types != {CollectionType.CALIBRATION}:
384 # We'll need a subquery for the tags table if any of the given
385 # collections are not a CALIBRATION collection. This intentionally
386 # also fires when the list of collections is empty as a way to
387 # create a dummy subquery that we know will fail.
388 # We give the table an alias because it might appear multiple times
389 # in the same query, for different dataset types.
390 tags_parts = sql.Payload[LogicalColumn](self._tags.alias(f"{self.datasetType.name}_tags"))
391 if "timespan" in columns:
392 tags_parts.columns_available[DatasetColumnTag(self.datasetType.name, "timespan")] = (
393 TimespanReprClass.fromLiteral(Timespan(None, None))
394 )
395 tag_relation = self._finish_single_relation(
396 tags_parts,
397 columns,
398 [
399 (record, rank)
400 for rank, record in enumerate(collections)
401 if record.type is not CollectionType.CALIBRATION
402 ],
403 context,
404 )
405 assert "calib_pkey" not in columns, "For internal use only, and only for pure-calib queries."
406 if CollectionType.CALIBRATION in collection_types:
407 # If at least one collection is a CALIBRATION collection, we'll
408 # need a subquery for the calibs table, and could include the
409 # timespan as a result or constraint.
410 assert (
411 self._calibs is not None
412 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
413 calibs_parts = sql.Payload[LogicalColumn](self._calibs.alias(f"{self.datasetType.name}_calibs"))
414 if "timespan" in columns:
415 calibs_parts.columns_available[DatasetColumnTag(self.datasetType.name, "timespan")] = (
416 TimespanReprClass.from_columns(calibs_parts.from_clause.columns)
417 )
418 if "calib_pkey" in columns:
419 # This is a private extension not included in the base class
420 # interface, for internal use only in _buildCalibOverlapQuery,
421 # which needs access to the autoincrement primary key for the
422 # calib association table.
423 calibs_parts.columns_available[DatasetColumnTag(self.datasetType.name, "calib_pkey")] = (
424 calibs_parts.from_clause.columns.id
425 )
426 calib_relation = self._finish_single_relation(
427 calibs_parts,
428 columns,
429 [
430 (record, rank)
431 for rank, record in enumerate(collections)
432 if record.type is CollectionType.CALIBRATION
433 ],
434 context,
435 )
436 if tag_relation is not None:
437 if calib_relation is not None:
438 # daf_relation's chain operation does not automatically
439 # deduplicate; it's more like SQL's UNION ALL. To get UNION
440 # in SQL here, we add an explicit deduplication.
441 return tag_relation.chain(calib_relation).without_duplicates()
442 else:
443 return tag_relation
444 elif calib_relation is not None:
445 return calib_relation
446 else:
447 raise AssertionError("Branch should be unreachable.")
449 def _finish_single_relation(
450 self,
451 payload: sql.Payload[LogicalColumn],
452 requested_columns: Set[str],
453 collections: Sequence[tuple[CollectionRecord, int]],
454 context: SqlQueryContext,
455 ) -> Relation:
456 """Handle adding columns and WHERE terms that are not specific to
457 either the tags or calibs tables.
459 Helper method for `make_relation`.
461 Parameters
462 ----------
463 payload : `lsst.daf.relation.sql.Payload`
464 SQL query parts under construction, to be modified in-place and
465 used to construct the new relation.
466 requested_columns : `~collections.abc.Set` [ `str` ]
467 Columns the relation should include.
468 collections : `~collections.abc.Sequence` [ `tuple` \
469 [ `CollectionRecord`, `int` ] ]
470 Collections to search for the dataset and their ranks.
471 context : `SqlQueryContext`
472 Context that manages engines and state for the query.
474 Returns
475 -------
476 relation : `lsst.daf.relation.Relation`
477 New dataset query relation.
478 """
479 payload.where.append(payload.from_clause.columns.dataset_type_id == self._dataset_type_id)
480 dataset_id_col = payload.from_clause.columns.dataset_id
481 collection_col = payload.from_clause.columns[self._collections.getCollectionForeignKeyName()]
482 # We always constrain and optionally retrieve the collection(s) via the
483 # tags/calibs table.
484 if len(collections) == 1:
485 payload.where.append(collection_col == collections[0][0].key)
486 if "collection" in requested_columns:
487 payload.columns_available[DatasetColumnTag(self.datasetType.name, "collection")] = (
488 sqlalchemy.sql.literal(collections[0][0].key)
489 )
490 else:
491 assert collections, "The no-collections case should be in calling code for better diagnostics."
492 payload.where.append(collection_col.in_([collection.key for collection, _ in collections]))
493 if "collection" in requested_columns:
494 payload.columns_available[DatasetColumnTag(self.datasetType.name, "collection")] = (
495 collection_col
496 )
497 # Add rank if requested as a CASE-based calculation on the collection
498 # column.
499 if "rank" in requested_columns:
500 payload.columns_available[DatasetColumnTag(self.datasetType.name, "rank")] = sqlalchemy.sql.case(
501 {record.key: rank for record, rank in collections},
502 value=collection_col,
503 )
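# Added note (schematic, hypothetical SQL): the CASE expression above renders
# as something like
#     CASE <collection FK column> WHEN :key_0 THEN 0 WHEN :key_1 THEN 1 ... END
# so each result row carries the search rank of the collection it was found in.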
504 # Add more column definitions, starting with the data ID.
505 for dimension_name in self.datasetType.dimensions.required.names:
506 payload.columns_available[DimensionKeyColumnTag(dimension_name)] = payload.from_clause.columns[
507 dimension_name
508 ]
509 # We can always get the dataset_id from the tags/calibs table.
510 if "dataset_id" in requested_columns:
511 payload.columns_available[DatasetColumnTag(self.datasetType.name, "dataset_id")] = dataset_id_col
512 # It's possible we now have everything we need, from just the
513 # tags/calibs table. The things we might need to get from the static
514 # dataset table are the run key and the ingest date.
515 need_static_table = False
516 if "run" in requested_columns:
517 if len(collections) == 1 and collections[0][0].type is CollectionType.RUN:
518 # If we are searching exactly one RUN collection, we
519 # know that if we find the dataset in that collection,
520 # then that's the dataset's run; we don't need to
521 # query for it.
522 payload.columns_available[DatasetColumnTag(self.datasetType.name, "run")] = (
523 sqlalchemy.sql.literal(collections[0][0].key)
524 )
525 else:
526 payload.columns_available[DatasetColumnTag(self.datasetType.name, "run")] = (
527 self._static.dataset.columns[self._runKeyColumn]
528 )
529 need_static_table = True
530 # Ingest date can only come from the static table.
531 if "ingest_date" in requested_columns:
532 need_static_table = True
533 payload.columns_available[DatasetColumnTag(self.datasetType.name, "ingest_date")] = (
534 self._static.dataset.columns.ingest_date
535 )
536 # If we need the static table, join it in via dataset_id and
537 # dataset_type_id.
538 if need_static_table:
539 payload.from_clause = payload.from_clause.join(
540 self._static.dataset, onclause=(dataset_id_col == self._static.dataset.columns.id)
541 )
542 # Also constrain dataset_type_id in static table in case that helps
543 # generate a better plan.
544 # We could also include this in the JOIN ON clause, but my guess is
545 # that that's a good idea IFF it's in the foreign key, and right
546 # now it isn't.
547 payload.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id)
548 leaf = context.sql_engine.make_leaf(
549 payload.columns_available.keys(),
550 payload=payload,
551 name=self.datasetType.name,
552 parameters={record.name: rank for record, rank in collections},
553 )
554 return leaf
556 def getDataId(self, id: DatasetId) -> DataCoordinate:
557 """Return DataId for a dataset.
559 Parameters
560 ----------
561 id : `DatasetId`
562 Unique dataset identifier.
564 Returns
565 -------
566 dataId : `DataCoordinate`
567 DataId for the dataset.
568 """
569 # This query could return multiple rows (one for each tagged collection
570 # the dataset is in, plus one for its run collection), and we don't
571 # care which of those we get.
572 sql = (
573 self._tags.select()
574 .where(
575 sqlalchemy.sql.and_(
576 self._tags.columns.dataset_id == id,
577 self._tags.columns.dataset_type_id == self._dataset_type_id,
578 )
579 )
580 .limit(1)
581 )
582 with self._db.query(sql) as sql_result:
583 row = sql_result.mappings().fetchone()
584 assert row is not None, "Should be guaranteed by caller and foreign key constraints."
585 return DataCoordinate.from_required_values(
586 self.datasetType.dimensions.as_group(),
587 tuple(row[dimension] for dimension in self.datasetType.dimensions.required.names),
588 )
591class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
592 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for
593 dataset IDs.
594 """
596 idMaker = DatasetIdFactory()
597 """Factory for dataset IDs. In the future this factory may be shared with
598 other classes (e.g. Registry)."""
600 def insert(
601 self,
602 run: RunRecord,
603 dataIds: Iterable[DataCoordinate],
604 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
605 ) -> Iterator[DatasetRef]:
606 # Docstring inherited from DatasetRecordStorage.
608 # Current timestamp, type depends on schema version. Use microsecond
609 # precision for astropy time to keep things consistent with
610 # TIMESTAMP(6) SQL type.
611 timestamp: datetime.datetime | astropy.time.Time
612 if self._use_astropy:
613 # Astropy `Time.now()` precision should be the same as `datetime.now()`,
614 # which should mean microseconds.
615 timestamp = astropy.time.Time.now()
616 else:
617 timestamp = datetime.datetime.now(datetime.UTC)
619 # Iterate over data IDs, transforming a possibly-single-pass iterable
620 # into a list.
621 dataIdList: list[DataCoordinate] = []
622 rows = []
623 summary = CollectionSummary()
624 for dataId in summary.add_data_ids_generator(self.datasetType, dataIds):
625 dataIdList.append(dataId)
626 rows.append(
627 {
628 "id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode),
629 "dataset_type_id": self._dataset_type_id,
630 self._runKeyColumn: run.key,
631 "ingest_date": timestamp,
632 }
633 )
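# Illustrative static-table row built above (added; values are hypothetical):
#
#     {"id": UUID("..."), "dataset_type_id": 12, "<run FK column>": 3,
#      "ingest_date": <timestamp>}
#
# One such row per data ID is inserted below, together with a matching
# tags-table row that carries the required data ID values.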
635 with self._db.transaction():
636 # Insert into the static dataset table.
637 self._db.insert(self._static.dataset, *rows)
638 # Update the summary tables for this collection in case this is the
639 # first time this dataset type or these governor values will be
640 # inserted there.
641 self._summaries.update(run, [self._dataset_type_id], summary)
642 # Combine the generated dataset_id values and data ID fields to
643 # form rows to be inserted into the tags table.
644 protoTagsRow = {
645 "dataset_type_id": self._dataset_type_id,
646 self._collections.getCollectionForeignKeyName(): run.key,
647 }
648 tagsRows = [
649 dict(protoTagsRow, dataset_id=row["id"], **dataId.required)
650 for dataId, row in zip(dataIdList, rows, strict=True)
651 ]
652 # Insert those rows into the tags table.
653 self._db.insert(self._tags, *tagsRows)
655 for dataId, row in zip(dataIdList, rows, strict=True):
656 yield DatasetRef(
657 datasetType=self.datasetType,
658 dataId=dataId,
659 id=row["id"],
660 run=run.name,
661 )
663 def import_(
664 self,
665 run: RunRecord,
666 datasets: Iterable[DatasetRef],
667 ) -> Iterator[DatasetRef]:
668 # Docstring inherited from DatasetRecordStorage.
670 # Current timestamp, type depends on schema version.
671 if self._use_astropy:
672 # Astropy `Time.now()` precision should be the same as `datetime.now()`,
673 # which should mean microseconds.
674 timestamp = sqlalchemy.sql.literal(astropy.time.Time.now(), type_=ddl.AstropyTimeNsecTai)
675 else:
676 timestamp = sqlalchemy.sql.literal(datetime.datetime.now(datetime.UTC))
678 # Iterate over data IDs, transforming a possibly-single-pass iterable
679 # into a list.
680 dataIds: dict[DatasetId, DataCoordinate] = {}
681 summary = CollectionSummary()
682 for dataset in summary.add_datasets_generator(datasets):
683 dataIds[dataset.id] = dataset.dataId
685 # We'll insert all new rows into a temporary table
686 tableSpec = makeTagTableSpec(self.datasetType, type(self._collections), ddl.GUID, constraints=False)
687 collFkName = self._collections.getCollectionForeignKeyName()
688 protoTagsRow = {
689 "dataset_type_id": self._dataset_type_id,
690 collFkName: run.key,
691 }
692 tmpRows = [
693 dict(protoTagsRow, dataset_id=dataset_id, **dataId.required)
694 for dataset_id, dataId in dataIds.items()
695 ]
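# Added summary (hedged): the transaction below stages the incoming rows in a
# temporary table, validates them against existing datasets, drops the ones
# whose dataset IDs already exist, and then copies what remains into the static
# dataset table and the tags table.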
696 with self._db.transaction(for_temp_tables=True), self._db.temporary_table(tableSpec) as tmp_tags:
697 # store all incoming data in a temporary table
698 self._db.insert(tmp_tags, *tmpRows)
700 # There are some checks that we want to make for consistency
701 # of the new datasets with existing ones.
702 self._validateImport(tmp_tags, run)
704 # Before we merge the temporary table into dataset/tags we need to
705 # drop datasets that are already there (and do not conflict).
706 self._db.deleteWhere(
707 tmp_tags,
708 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)),
709 )
711 # Copy it into the dataset table; we need to re-label some columns.
712 self._db.insert(
713 self._static.dataset,
714 select=sqlalchemy.sql.select(
715 tmp_tags.columns.dataset_id.label("id"),
716 tmp_tags.columns.dataset_type_id,
717 tmp_tags.columns[collFkName].label(self._runKeyColumn),
718 timestamp.label("ingest_date"),
719 ),
720 )
722 # Update the summary tables for this collection in case this
723 # is the first time this dataset type or these governor values
724 # will be inserted there.
725 self._summaries.update(run, [self._dataset_type_id], summary)
727 # Copy it into tags table.
728 self._db.insert(self._tags, select=tmp_tags.select())
730 # Return refs in the same order as in the input list.
731 for dataset_id, dataId in dataIds.items():
732 yield DatasetRef(
733 datasetType=self.datasetType,
734 id=dataset_id,
735 dataId=dataId,
736 run=run.name,
737 )
739 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None:
740 """Validate imported refs against existing datasets.
742 Parameters
743 ----------
744 tmp_tags : `sqlalchemy.schema.Table`
745 Temporary table with new datasets and the same schema as tags
746 table.
747 run : `RunRecord`
748 The record object describing the `~CollectionType.RUN` collection.
750 Raises
751 ------
752 ConflictingDefinitionError
753 Raised if new datasets conflict with existing ones.
754 """
755 dataset = self._static.dataset
756 tags = self._tags
757 collFkName = self._collections.getCollectionForeignKeyName()
759 # Check that existing datasets have the same dataset type and
760 # run.
761 query = (
762 sqlalchemy.sql.select(
763 dataset.columns.id.label("dataset_id"),
764 dataset.columns.dataset_type_id.label("dataset_type_id"),
765 tmp_tags.columns.dataset_type_id.label("new_dataset_type_id"),
766 dataset.columns[self._runKeyColumn].label("run"),
767 tmp_tags.columns[collFkName].label("new_run"),
768 )
769 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id))
770 .where(
771 sqlalchemy.sql.or_(
772 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
773 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName],
774 )
775 )
776 .limit(1)
777 )
778 with self._db.query(query) as result:
779 # Only include the first one in the exception message
780 if (row := result.first()) is not None:
781 existing_run = self._collections[row.run].name
782 new_run = self._collections[row.new_run].name
783 if row.dataset_type_id == self._dataset_type_id:
784 if row.new_dataset_type_id == self._dataset_type_id:  # 784 ↛ 790: condition on line 784 was never false
785 raise ConflictingDefinitionError(
786 f"Current run {existing_run!r} and new run {new_run!r} do not agree for "
787 f"dataset {row.dataset_id}."
788 )
789 else:
790 raise ConflictingDefinitionError(
791 f"Dataset {row.dataset_id} was provided with type {self.datasetType.name!r} "
792 f"in run {new_run!r}, but was already defined with type ID {row.dataset_type_id} "
793 f"in run {run!r}."
794 )
795 else:
796 raise ConflictingDefinitionError(
797 f"Dataset {row.dataset_id} was provided with type ID {row.new_dataset_type_id} "
798 f"in run {new_run!r}, but was already defined with type {self.datasetType.name!r} "
799 f"in run {run!r}."
800 )
802 # Check that a matching dataset in the tags table has the same DataId.
803 query = (
804 sqlalchemy.sql.select(
805 tags.columns.dataset_id,
806 tags.columns.dataset_type_id.label("type_id"),
807 tmp_tags.columns.dataset_type_id.label("new_type_id"),
808 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
809 *[
810 tmp_tags.columns[dim].label(f"new_{dim}")
811 for dim in self.datasetType.dimensions.required.names
812 ],
813 )
814 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id))
815 .where(
816 sqlalchemy.sql.or_(
817 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
818 *[
819 tags.columns[dim] != tmp_tags.columns[dim]
820 for dim in self.datasetType.dimensions.required.names
821 ],
822 )
823 )
824 .limit(1)
825 )
827 with self._db.query(query) as result:
828 if (row := result.first()) is not None:
829 # Only include the first one in the exception message
830 raise ConflictingDefinitionError(
831 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}"
832 )
834 # Check that matching run+dataId have the same dataset ID.
835 query = (
836 sqlalchemy.sql.select(
837 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
838 tags.columns.dataset_id,
839 tmp_tags.columns.dataset_id.label("new_dataset_id"),
840 tags.columns[collFkName],
841 tmp_tags.columns[collFkName].label(f"new_{collFkName}"),
842 )
843 .select_from(
844 tags.join(
845 tmp_tags,
846 sqlalchemy.sql.and_(
847 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id,
848 tags.columns[collFkName] == tmp_tags.columns[collFkName],
849 *[
850 tags.columns[dim] == tmp_tags.columns[dim]
851 for dim in self.datasetType.dimensions.required.names
852 ],
853 ),
854 )
855 )
856 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id)
857 .limit(1)
858 )
859 with self._db.query(query) as result:
860 # only include the first one in the exception message
861 if (row := result.first()) is not None:
862 data_id = {dim: getattr(row, dim) for dim in self.datasetType.dimensions.required.names}
863 existing_collection = self._collections[getattr(row, collFkName)].name
864 new_collection = self._collections[getattr(row, f"new_{collFkName}")].name
865 raise ConflictingDefinitionError(
866 f"Dataset with type {self.datasetType.name!r} and data ID {data_id} "
867 f"has ID {row.dataset_id} in existing collection {existing_collection!r} "
868 f"but ID {row.new_dataset_id} in new collection {new_collection!r}."
869 )