Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 95%
241 statements
coverage.py v6.5.0, created at 2023-04-14 09:21 +0000
1 # This file is part of daf_butler.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
23 from __future__ import annotations
25 __all__ = ("ByDimensionsDatasetRecordStorage",)
27 import uuid
28 from collections.abc import Iterable, Iterator, Sequence, Set
29 from datetime import datetime
30 from typing import TYPE_CHECKING
32 import astropy.time
33 import sqlalchemy
34 from lsst.daf.relation import Relation, sql
36 from ....core import (
37 DataCoordinate,
38 DatasetColumnTag,
39 DatasetId,
40 DatasetRef,
41 DatasetType,
42 DimensionKeyColumnTag,
43 LogicalColumn,
44 Timespan,
45 ddl,
46 )
47 from ..._collection_summary import CollectionSummary
48 from ..._collectionType import CollectionType
49 from ..._exceptions import CollectionTypeError, ConflictingDefinitionError
50 from ...interfaces import DatasetIdFactory, DatasetIdGenEnum, DatasetRecordStorage
51 from ...queries import SqlQueryContext
52 from .tables import makeTagTableSpec
54 if TYPE_CHECKING:
55 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
56 from .summaries import CollectionSummaryManager
57 from .tables import StaticDatasetTablesTuple
60 class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
61 """Dataset record storage implementation paired with
62 `ByDimensionsDatasetRecordStorageManagerUUID`; see that class for more
63 information.
65 Instances of this class should never be constructed directly; use
66 `DatasetRecordStorageManager.register` instead.
67 """
69 def __init__(
70 self,
71 *,
72 datasetType: DatasetType,
73 db: Database,
74 dataset_type_id: int,
75 collections: CollectionManager,
76 static: StaticDatasetTablesTuple,
77 summaries: CollectionSummaryManager,
78 tags: sqlalchemy.schema.Table,
79 use_astropy_ingest_date: bool,
80 calibs: sqlalchemy.schema.Table | None,
81 ):
82 super().__init__(datasetType=datasetType)
83 self._dataset_type_id = dataset_type_id
84 self._db = db
85 self._collections = collections
86 self._static = static
87 self._summaries = summaries
88 self._tags = tags
89 self._calibs = calibs
90 self._runKeyColumn = collections.getRunForeignKeyName()
91 self._use_astropy = use_astropy_ingest_date
93 def delete(self, datasets: Iterable[DatasetRef]) -> None:
94 # Docstring inherited from DatasetRecordStorage.
95 # Only delete from common dataset table; ON DELETE foreign key clauses
96 # will handle the rest.
97 self._db.delete(
98 self._static.dataset,
99 ["id"],
100 *[{"id": dataset.getCheckedId()} for dataset in datasets],
101 )
103 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
104 # Docstring inherited from DatasetRecordStorage.
105 if collection.type is not CollectionType.TAGGED: 105 ↛ 106 (condition was never true)
106 raise TypeError(
107 f"Cannot associate into collection '{collection.name}' "
108 f"of type {collection.type.name}; must be TAGGED."
109 )
110 protoRow = {
111 self._collections.getCollectionForeignKeyName(): collection.key,
112 "dataset_type_id": self._dataset_type_id,
113 }
114 rows = []
115 summary = CollectionSummary()
116 for dataset in summary.add_datasets_generator(datasets):
117 row = dict(protoRow, dataset_id=dataset.getCheckedId())
118 for dimension, value in dataset.dataId.items():
119 row[dimension.name] = value
120 rows.append(row)
121 # Update the summary tables for this collection in case this is the
122 # first time this dataset type or these governor values will be
123 # inserted there.
124 self._summaries.update(collection, [self._dataset_type_id], summary)
125 # Update the tag table itself.
126 self._db.replace(self._tags, *rows)
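# As an illustration (values hypothetical): for a dataset type with dimensions
# ("instrument", "detector"), each row passed to ``self._db.replace`` above
# would look roughly like
#     {"<collection_fk>": <collection key>, "dataset_type_id": 42,
#      "dataset_id": <uuid>, "instrument": "HSC", "detector": 101}
# with the collection foreign-key column name taken from
# ``self._collections.getCollectionForeignKeyName()``.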
128 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
129 # Docstring inherited from DatasetRecordStorage.
130 if collection.type is not CollectionType.TAGGED: 130 ↛ 131 (condition was never true)
131 raise TypeError(
132 f"Cannot disassociate from collection '{collection.name}' "
133 f"of type {collection.type.name}; must be TAGGED."
134 )
135 rows = [
136 {
137 "dataset_id": dataset.getCheckedId(),
138 self._collections.getCollectionForeignKeyName(): collection.key,
139 }
140 for dataset in datasets
141 ]
142 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows)
144 def _buildCalibOverlapQuery(
145 self,
146 collection: CollectionRecord,
147 data_ids: set[DataCoordinate] | None,
148 timespan: Timespan,
149 context: SqlQueryContext,
150 ) -> Relation:
151 relation = self.make_relation(
152 collection, columns={"timespan", "dataset_id", "calib_pkey"}, context=context
153 ).with_rows_satisfying(
154 context.make_timespan_overlap_predicate(
155 DatasetColumnTag(self.datasetType.name, "timespan"), timespan
156 ),
157 )
158 if data_ids is not None:
159 relation = relation.join(
160 context.make_data_id_relation(
161 data_ids, self.datasetType.dimensions.required.names
162 ).transferred_to(context.sql_engine),
163 )
164 return relation
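# Roughly speaking, the relation built above represents the calibs-table rows
# in ``collection`` whose validity range overlaps ``timespan``, optionally
# restricted to the given ``data_ids``; ``certify`` only counts these rows to
# detect conflicts, while ``decertify`` fetches them in order to rewrite their
# timespans.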
166 def certify(
167 self,
168 collection: CollectionRecord,
169 datasets: Iterable[DatasetRef],
170 timespan: Timespan,
171 context: SqlQueryContext,
172 ) -> None:
173 # Docstring inherited from DatasetRecordStorage.
174 if self._calibs is None: 174 ↛ 175 (condition was never true)
175 raise CollectionTypeError(
176 f"Cannot certify datasets of type {self.datasetType.name}, for which "
177 "DatasetType.isCalibration() is False."
178 )
179 if collection.type is not CollectionType.CALIBRATION: 179 ↛ 180 (condition was never true)
180 raise CollectionTypeError(
181 f"Cannot certify into collection '{collection.name}' "
182 f"of type {collection.type.name}; must be CALIBRATION."
183 )
184 TimespanReprClass = self._db.getTimespanRepresentation()
185 protoRow = {
186 self._collections.getCollectionForeignKeyName(): collection.key,
187 "dataset_type_id": self._dataset_type_id,
188 }
189 rows = []
190 dataIds: set[DataCoordinate] | None = (
191 set() if not TimespanReprClass.hasExclusionConstraint() else None
192 )
193 summary = CollectionSummary()
194 for dataset in summary.add_datasets_generator(datasets):
195 row = dict(protoRow, dataset_id=dataset.getCheckedId())
196 for dimension, value in dataset.dataId.items():
197 row[dimension.name] = value
198 TimespanReprClass.update(timespan, result=row)
199 rows.append(row)
200 if dataIds is not None: 200 ↛ 194 (condition was never false)
201 dataIds.add(dataset.dataId)
202 # Update the summary tables for this collection in case this is the
203 # first time this dataset type or these governor values will be
204 # inserted there.
205 self._summaries.update(collection, [self._dataset_type_id], summary)
206 # Update the association table itself.
207 if TimespanReprClass.hasExclusionConstraint(): 207 ↛ 210 (condition was never true)
208 # Rely on database constraint to enforce invariants; we just
209 # reraise the exception for consistency across DB engines.
210 try:
211 self._db.insert(self._calibs, *rows)
212 except sqlalchemy.exc.IntegrityError as err:
213 raise ConflictingDefinitionError(
214 f"Validity range conflict certifying datasets of type {self.datasetType.name} "
215 f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
216 ) from err
217 else:
218 # Have to implement exclusion constraint ourselves.
219 # Start by building a SELECT query for any rows that would overlap
220 # this one.
221 relation = self._buildCalibOverlapQuery(collection, dataIds, timespan, context)
222 # Acquire a table lock to ensure there are no concurrent writes
223 # that could invalidate our checks before we finish the inserts. We
224 # use a SAVEPOINT in case there is an outer transaction that a
225 # failure here should not roll back.
226 with self._db.transaction(lock=[self._calibs], savepoint=True):
227 # Enter SqlQueryContext in case we need to use a temporary
228 # table to include the given data IDs in the query. Note that
229 # by doing this inside the transaction, we make sure it doesn't
230 # attempt to close the session when it's done, since it just
231 # sees an already-open session that it knows it shouldn't
232 # manage.
233 with context:
234 # Run the check SELECT query.
235 conflicting = context.count(context.process(relation))
236 if conflicting > 0:
237 raise ConflictingDefinitionError(
238 f"{conflicting} validity range conflicts certifying datasets of type "
239 f"{self.datasetType.name} into {collection.name} for range "
240 f"[{timespan.begin}, {timespan.end})."
241 )
242 # Proceed with the insert.
243 self._db.insert(self._calibs, *rows)
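# Where the database cannot enforce an exclusion constraint, the block above
# amounts to the following check-then-insert pattern (a sketch only, with a
# purely illustrative helper name):
#
#     with db.transaction(lock=[calibs_table], savepoint=True):
#         if count_overlapping_rows(collection, dataIds, timespan) > 0:
#             raise ConflictingDefinitionError(...)
#         db.insert(calibs_table, *rows)
#
# The table lock is what makes the SELECT-then-INSERT sequence safe against
# concurrent writers.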
245 def decertify(
246 self,
247 collection: CollectionRecord,
248 timespan: Timespan,
249 *,
250 dataIds: Iterable[DataCoordinate] | None = None,
251 context: SqlQueryContext,
252 ) -> None:
253 # Docstring inherited from DatasetRecordStorage.
254 if self._calibs is None: 254 ↛ 255 (condition was never true)
255 raise CollectionTypeError(
256 f"Cannot decertify datasets of type {self.datasetType.name}, for which "
257 "DatasetType.isCalibration() is False."
258 )
259 if collection.type is not CollectionType.CALIBRATION: 259 ↛ 260 (condition was never true)
260 raise CollectionTypeError(
261 f"Cannot decertify from collection '{collection.name}' "
262 f"of type {collection.type.name}; must be CALIBRATION."
263 )
264 TimespanReprClass = self._db.getTimespanRepresentation()
265 # Construct a SELECT query to find all rows that overlap our inputs.
266 dataIdSet: set[DataCoordinate] | None
267 if dataIds is not None:
268 dataIdSet = set(dataIds)
269 else:
270 dataIdSet = None
271 relation = self._buildCalibOverlapQuery(collection, dataIdSet, timespan, context)
272 calib_pkey_tag = DatasetColumnTag(self.datasetType.name, "calib_pkey")
273 dataset_id_tag = DatasetColumnTag(self.datasetType.name, "dataset_id")
274 timespan_tag = DatasetColumnTag(self.datasetType.name, "timespan")
275 data_id_tags = [
276 (name, DimensionKeyColumnTag(name)) for name in self.datasetType.dimensions.required.names
277 ]
278 # Set up collections to populate with the rows we'll want to modify.
279 # The insert rows will have the same values for collection and
280 # dataset type.
281 protoInsertRow = {
282 self._collections.getCollectionForeignKeyName(): collection.key,
283 "dataset_type_id": self._dataset_type_id,
284 }
285 rowsToDelete = []
286 rowsToInsert = []
287 # Acquire a table lock to ensure there are no concurrent writes
288 # between the SELECT and the DELETE and INSERT queries based on it.
289 with self._db.transaction(lock=[self._calibs], savepoint=True):
290 # Enter SqlQueryContext in case we need to use a temporary table to
291 # include the given data IDs in the query (see similar block in
292 # certify for details).
293 with context:
294 for row in context.fetch_iterable(relation):
295 rowsToDelete.append({"id": row[calib_pkey_tag]})
296 # Construct the insert row(s) by copying the prototype row,
297 # then adding the dimension column values, then adding
298 # what's left of the timespan from that row after we
299 # subtract the given timespan.
300 newInsertRow = protoInsertRow.copy()
301 newInsertRow["dataset_id"] = row[dataset_id_tag]
302 for name, tag in data_id_tags:
303 newInsertRow[name] = row[tag]
304 rowTimespan = row[timespan_tag]
305 assert rowTimespan is not None, "Field should have a NOT NULL constraint."
306 for diffTimespan in rowTimespan.difference(timespan):
307 rowsToInsert.append(
308 TimespanReprClass.update(diffTimespan, result=newInsertRow.copy())
309 )
310 # Run the DELETE and INSERT queries.
311 self._db.delete(self._calibs, ["id"], *rowsToDelete)
312 self._db.insert(self._calibs, *rowsToInsert)
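# ``Timespan.difference`` is what splits an existing validity range around the
# decertified interval: for example, decertifying [t2, t3) from a row valid
# over [t1, t4) (with t1 < t2 < t3 < t4) deletes that row and inserts two
# replacements covering [t1, t2) and [t3, t4), while decertifying an interval
# that fully covers the row inserts nothing in its place.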
314 def make_relation(
315 self,
316 *collections: CollectionRecord,
317 columns: Set[str],
318 context: SqlQueryContext,
319 ) -> Relation:
320 # Docstring inherited from DatasetRecordStorage.
321 collection_types = {collection.type for collection in collections}
322 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened."
323 TimespanReprClass = self._db.getTimespanRepresentation()
324 #
325 # There are two kinds of table in play here:
326 #
327 # - the static dataset table (with the dataset ID, dataset type ID,
328 # run ID/name, and ingest date);
329 #
330 # - the dynamic tags/calibs table (with the dataset ID, dataset
331 # type ID, collection ID/name, data ID, and possibly validity
332 # range).
333 #
334 # That means that we might want to return a query against either table
335 # or a JOIN of both, depending on which quantities the caller wants.
336 # But the data ID is always included, which means we'll always include
337 # the tags/calibs table and join in the static dataset table only if we
338 # need things from it that we can't get from the tags/calibs table.
339 #
340 # Note that it's important that we include a WHERE constraint on both
341 # tables for any column (e.g. dataset_type_id) that is in both when
342 # it's given explicitly; not doing so can prevent the query planner from
343 # using very important indexes. At present, we don't include those
344 # redundant columns in the JOIN ON expression, however, because the
345 # FOREIGN KEY (and its index) are defined only on dataset_id.
346 tag_relation: Relation | None = None
347 calib_relation: Relation | None = None
348 if collection_types != {CollectionType.CALIBRATION}:
349 # We'll need a subquery for the tags table if any of the given
350 # collections are not a CALIBRATION collection. This intentionally
351 # also fires when the list of collections is empty as a way to
352 # create a dummy subquery that we know will fail.
353 # We give the table an alias because it might appear multiple times
354 # in the same query, for different dataset types.
355 tags_parts = sql.Payload[LogicalColumn](self._tags.alias(f"{self.datasetType.name}_tags"))
356 if "timespan" in columns:
357 tags_parts.columns_available[
358 DatasetColumnTag(self.datasetType.name, "timespan")
359 ] = TimespanReprClass.fromLiteral(Timespan(None, None))
360 tag_relation = self._finish_single_relation(
361 tags_parts,
362 columns,
363 [
364 (record, rank)
365 for rank, record in enumerate(collections)
366 if record.type is not CollectionType.CALIBRATION
367 ],
368 context,
369 )
370 assert "calib_pkey" not in columns, "For internal use only, and only for pure-calib queries."
371 if CollectionType.CALIBRATION in collection_types:
372 # If at least one collection is a CALIBRATION collection, we'll
373 # need a subquery for the calibs table, and could include the
374 # timespan as a result or constraint.
375 assert (
376 self._calibs is not None
377 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
378 calibs_parts = sql.Payload[LogicalColumn](self._calibs.alias(f"{self.datasetType.name}_calibs"))
379 if "timespan" in columns:
380 calibs_parts.columns_available[
381 DatasetColumnTag(self.datasetType.name, "timespan")
382 ] = TimespanReprClass.from_columns(calibs_parts.from_clause.columns)
383 if "calib_pkey" in columns:
384 # This is a private extension not included in the base class
385 # interface, for internal use only in _buildCalibOverlapQuery,
386 # which needs access to the autoincrement primary key for the
387 # calib association table.
388 calibs_parts.columns_available[
389 DatasetColumnTag(self.datasetType.name, "calib_pkey")
390 ] = calibs_parts.from_clause.columns.id
391 calib_relation = self._finish_single_relation(
392 calibs_parts,
393 columns,
394 [
395 (record, rank)
396 for rank, record in enumerate(collections)
397 if record.type is CollectionType.CALIBRATION
398 ],
399 context,
400 )
401 if tag_relation is not None:
402 if calib_relation is not None:
403 # daf_relation's chain operation does not automatically
404 # deduplicate; it's more like SQL's UNION ALL. To get UNION
405 # in SQL here, we add an explicit deduplication.
406 return tag_relation.chain(calib_relation).without_duplicates()
407 else:
408 return tag_relation
409 elif calib_relation is not None:
410 return calib_relation
411 else:
412 raise AssertionError("Branch should be unreachable.")
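# When both subqueries are present, the result above corresponds roughly to
#
#     SELECT ... FROM <tags subquery>
#     UNION
#     SELECT ... FROM <calibs subquery>
#
# in SQL terms: ``chain`` alone would behave like UNION ALL, and
# ``without_duplicates`` supplies the deduplication that turns it into a true
# UNION.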
414 def _finish_single_relation(
415 self,
416 payload: sql.Payload[LogicalColumn],
417 requested_columns: Set[str],
418 collections: Sequence[tuple[CollectionRecord, int]],
419 context: SqlQueryContext,
420 ) -> Relation:
421 """Helper method for `make_relation`.
423 This handles adding columns and WHERE terms that are not specific to
424 either the tags or calibs tables.
426 Parameters
427 ----------
428 payload : `lsst.daf.relation.sql.Payload`
429 SQL query parts under construction, to be modified in-place and
430 used to construct the new relation.
431 requested_columns : `~collections.abc.Set` [ `str` ]
432 Columns the relation should include.
433 collections : `Sequence` [ `tuple` [ `CollectionRecord`, `int` ] ]
434 Collections to search for the dataset and their ranks.
435 context : `SqlQueryContext`
436 Context that manages engines and state for the query.
438 Returns
439 -------
440 relation : `lsst.daf.relation.Relation`
441 New dataset query relation.
442 """
443 payload.where.append(payload.from_clause.columns.dataset_type_id == self._dataset_type_id)
444 dataset_id_col = payload.from_clause.columns.dataset_id
445 collection_col = payload.from_clause.columns[self._collections.getCollectionForeignKeyName()]
446 # We always constrain and optionally retrieve the collection(s) via the
447 # tags/calibs table.
448 if len(collections) == 1:
449 payload.where.append(collection_col == collections[0][0].key)
450 if "collection" in requested_columns:
451 payload.columns_available[
452 DatasetColumnTag(self.datasetType.name, "collection")
453 ] = sqlalchemy.sql.literal(collections[0][0].key)
454 else:
455 assert collections, "The no-collections case should be in calling code for better diagnostics."
456 payload.where.append(collection_col.in_([collection.key for collection, _ in collections]))
457 if "collection" in requested_columns:
458 payload.columns_available[
459 DatasetColumnTag(self.datasetType.name, "collection")
460 ] = collection_col
461 # Add rank, if requested, as a CASE-based calculation on the collection
462 # column.
463 if "rank" in requested_columns:
464 payload.columns_available[DatasetColumnTag(self.datasetType.name, "rank")] = sqlalchemy.sql.case(
465 {record.key: rank for record, rank in collections},
466 value=collection_col,
467 )
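# For example (keys and ranks purely illustrative): searching collections with
# keys 7 and 12 at ranks 0 and 1 produces roughly
#     CASE <collection_fk_column> WHEN 7 THEN 0 WHEN 12 THEN 1 END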
468 # Add more column definitions, starting with the data ID.
469 for dimension_name in self.datasetType.dimensions.required.names:
470 payload.columns_available[DimensionKeyColumnTag(dimension_name)] = payload.from_clause.columns[
471 dimension_name
472 ]
473 # We can always get the dataset_id from the tags/calibs table.
474 if "dataset_id" in requested_columns:
475 payload.columns_available[DatasetColumnTag(self.datasetType.name, "dataset_id")] = dataset_id_col
476 # It's possible we now have everything we need, from just the
477 # tags/calibs table. The things we might need to get from the static
478 # dataset table are the run key and the ingest date.
479 need_static_table = False
480 if "run" in requested_columns:
481 if len(collections) == 1 and collections[0][0].type is CollectionType.RUN:
482 # If we are searching exactly one RUN collection, we
483 # know that if we find the dataset in that collection,
484 # then that's the dataset's run; we don't need to
485 # query for it.
486 payload.columns_available[
487 DatasetColumnTag(self.datasetType.name, "run")
488 ] = sqlalchemy.sql.literal(collections[0][0].key)
489 else:
490 payload.columns_available[
491 DatasetColumnTag(self.datasetType.name, "run")
492 ] = self._static.dataset.columns[self._runKeyColumn]
493 need_static_table = True
494 # Ingest date can only come from the static table.
495 if "ingest_date" in requested_columns:
496 need_static_table = True
497 payload.columns_available[
498 DatasetColumnTag(self.datasetType.name, "ingest_date")
499 ] = self._static.dataset.columns.ingest_date
500 # If we need the static table, join it in via dataset_id and
501 # dataset_type_id.
502 if need_static_table:
503 payload.from_clause = payload.from_clause.join(
504 self._static.dataset, onclause=(dataset_id_col == self._static.dataset.columns.id)
505 )
506 # Also constrain dataset_type_id in static table in case that helps
507 # generate a better plan.
508 # We could also include this in the JOIN ON clause, but my guess is
509 # that that's a good idea IFF it's in the foreign key, and right
510 # now it isn't.
511 payload.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id)
512 leaf = context.sql_engine.make_leaf(
513 payload.columns_available.keys(),
514 payload=payload,
515 name=self.datasetType.name,
516 parameters={record.name: rank for record, rank in collections},
517 )
518 return leaf
520 def getDataId(self, id: DatasetId) -> DataCoordinate:
521 """Return DataId for a dataset.
523 Parameters
524 ----------
525 id : `DatasetId`
526 Unique dataset identifier.
528 Returns
529 -------
530 dataId : `DataCoordinate`
531 DataId for the dataset.
532 """
533 # This query could return multiple rows (one for each tagged collection
534 # the dataset is in, plus one for its run collection), and we don't
535 # care which of those we get.
536 sql = (
537 self._tags.select()
538 .where(
539 sqlalchemy.sql.and_(
540 self._tags.columns.dataset_id == id,
541 self._tags.columns.dataset_type_id == self._dataset_type_id,
542 )
543 )
544 .limit(1)
545 )
546 with self._db.query(sql) as sql_result:
547 row = sql_result.mappings().fetchone()
548 assert row is not None, "Should be guaranteed by caller and foreign key constraints."
549 return DataCoordinate.standardize(
550 {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
551 graph=self.datasetType.dimensions,
552 )
555 class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
556 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for
557 dataset IDs.
558 """
560 idMaker = DatasetIdFactory()
561 """Factory for dataset IDs. In the future this factory may be shared with
562 other classes (e.g. Registry)."""
564 def insert(
565 self,
566 run: RunRecord,
567 dataIds: Iterable[DataCoordinate],
568 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
569 ) -> Iterator[DatasetRef]:
570 # Docstring inherited from DatasetRecordStorage.
572 # Current timestamp; type depends on schema version. Use microsecond
573 # precision for astropy time to keep things consistent with
574 # TIMESTAMP(6) SQL type.
575 timestamp: datetime | astropy.time.Time
576 if self._use_astropy:
577 # Astropy `now()` precision should be the same as `utcnow()` which
578 # should mean microsecond.
579 timestamp = astropy.time.Time.now()
580 else:
581 timestamp = datetime.utcnow()
583 # Iterate over data IDs, transforming a possibly-single-pass iterable
584 # into a list.
585 dataIdList = []
586 rows = []
587 summary = CollectionSummary()
588 for dataId in summary.add_data_ids_generator(self.datasetType, dataIds):
589 dataIdList.append(dataId)
590 rows.append(
591 {
592 "id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode),
593 "dataset_type_id": self._dataset_type_id,
594 self._runKeyColumn: run.key,
595 "ingest_date": timestamp,
596 }
597 )
599 with self._db.transaction():
600 # Insert into the static dataset table.
601 self._db.insert(self._static.dataset, *rows)
602 # Update the summary tables for this collection in case this is the
603 # first time this dataset type or these governor values will be
604 # inserted there.
605 self._summaries.update(run, [self._dataset_type_id], summary)
606 # Combine the generated dataset_id values and data ID fields to
607 # form rows to be inserted into the tags table.
608 protoTagsRow = {
609 "dataset_type_id": self._dataset_type_id,
610 self._collections.getCollectionForeignKeyName(): run.key,
611 }
612 tagsRows = [
613 dict(protoTagsRow, dataset_id=row["id"], **dataId.byName())
614 for dataId, row in zip(dataIdList, rows)
615 ]
616 # Insert those rows into the tags table.
617 self._db.insert(self._tags, *tagsRows)
619 for dataId, row in zip(dataIdList, rows):
620 yield DatasetRef(
621 datasetType=self.datasetType,
622 dataId=dataId,
623 id=row["id"],
624 run=run.name,
625 )
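# Each inserted dataset yields one row in the static dataset table and one in
# the tags table; with hypothetical values they look roughly like
#     dataset: {"id": <uuid>, "dataset_type_id": 42, "<run_fk>": <run key>,
#               "ingest_date": <timestamp>}
#     tags:    {"dataset_id": <uuid>, "dataset_type_id": 42,
#               "<collection_fk>": <run key>, "instrument": "HSC", ...}
# where the data ID columns in the tags row come from ``dataId.byName()``.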
627 def import_(
628 self,
629 run: RunRecord,
630 datasets: Iterable[DatasetRef],
631 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
632 reuseIds: bool = False,
633 ) -> Iterator[DatasetRef]:
634 # Docstring inherited from DatasetRecordStorage.
636 # Current timestamp; type depends on schema version.
637 if self._use_astropy:
638 # Astropy `now()` precision should be the same as `utcnow()` which
639 # should mean microsecond.
640 timestamp = sqlalchemy.sql.literal(astropy.time.Time.now(), type_=ddl.AstropyTimeNsecTai)
641 else:
642 timestamp = sqlalchemy.sql.literal(datetime.utcnow())
644 # Iterate over data IDs, transforming a possibly-single-pass iterable
645 # into a list.
646 dataIds = {}
647 summary = CollectionSummary()
648 for dataset in summary.add_datasets_generator(datasets):
649 # Ignore unknown ID types; normally all IDs have the same type, but
650 # this code supports mixed types or missing IDs.
651 datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None
652 if datasetId is None:
653 datasetId = self.idMaker.makeDatasetId(
654 run.name, self.datasetType, dataset.dataId, idGenerationMode
655 )
656 dataIds[datasetId] = dataset.dataId
658 # We'll insert all new rows into a temporary table.
659 tableSpec = makeTagTableSpec(self.datasetType, type(self._collections), ddl.GUID, constraints=False)
660 collFkName = self._collections.getCollectionForeignKeyName()
661 protoTagsRow = {
662 "dataset_type_id": self._dataset_type_id,
663 collFkName: run.key,
664 }
665 tmpRows = [
666 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
667 for dataset_id, dataId in dataIds.items()
668 ]
669 with self._db.transaction(for_temp_tables=True):
670 with self._db.temporary_table(tableSpec) as tmp_tags:
671 # Store all incoming data in a temporary table.
672 self._db.insert(tmp_tags, *tmpRows)
674 # There are some checks that we want to make for consistency
675 # of the new datasets with existing ones.
676 self._validateImport(tmp_tags, run)
678 # Before we merge the temporary table into dataset/tags we need to
679 # drop datasets which are already there (and do not conflict).
680 self._db.deleteWhere(
681 tmp_tags,
682 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)),
683 )
685 # Copy it into the dataset table; we need to re-label some columns.
686 self._db.insert(
687 self._static.dataset,
688 select=sqlalchemy.sql.select(
689 tmp_tags.columns.dataset_id.label("id"),
690 tmp_tags.columns.dataset_type_id,
691 tmp_tags.columns[collFkName].label(self._runKeyColumn),
692 timestamp.label("ingest_date"),
693 ),
694 )
696 # Update the summary tables for this collection in case this
697 # is the first time this dataset type or these governor values
698 # will be inserted there.
699 self._summaries.update(run, [self._dataset_type_id], summary)
701 # Copy it into tags table.
702 self._db.insert(self._tags, select=tmp_tags.select())
704 # Return refs in the same order as in the input list.
705 for dataset_id, dataId in dataIds.items():
706 yield DatasetRef(
707 datasetType=self.datasetType,
708 id=dataset_id,
709 dataId=dataId,
710 run=run.name,
711 )
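# In outline, the import above is a staged merge:
#
#     1. insert all candidate rows into a temporary, tags-shaped table;
#     2. validate them against existing datasets (``_validateImport``);
#     3. delete from the temporary table any dataset_id that already exists in
#        the static dataset table (already-present datasets are not conflicts,
#        just no-ops);
#     4. INSERT ... SELECT what remains into the static dataset table
#        (relabelling columns) and then into the tags table.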
713 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None:
714 """Validate imported refs against existing datasets.
716 Parameters
717 ----------
718 tmp_tags : `sqlalchemy.schema.Table`
719 Temporary table with new datasets and the same schema as tags
720 table.
721 run : `RunRecord`
722 The record object describing the `~CollectionType.RUN` collection.
724 Raises
725 ------
726 ConflictingDefinitionError
727 Raised if new datasets conflict with existing ones.
728 """
729 dataset = self._static.dataset
730 tags = self._tags
731 collFkName = self._collections.getCollectionForeignKeyName()
733 # Check that existing datasets have the same dataset type and
734 # run.
735 query = (
736 sqlalchemy.sql.select(
737 dataset.columns.id.label("dataset_id"),
738 dataset.columns.dataset_type_id.label("dataset_type_id"),
739 tmp_tags.columns.dataset_type_id.label("new dataset_type_id"),
740 dataset.columns[self._runKeyColumn].label("run"),
741 tmp_tags.columns[collFkName].label("new run"),
742 )
743 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id))
744 .where(
745 sqlalchemy.sql.or_(
746 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
747 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName],
748 )
749 )
750 .limit(1)
751 )
752 with self._db.query(query) as result:
753 if (row := result.first()) is not None:
754 # Only include the first one in the exception message
755 raise ConflictingDefinitionError(
756 f"Existing dataset type or run do not match new dataset: {row._asdict()}"
757 )
759 # Check that matching dataset in tags table has the same DataId.
760 query = (
761 sqlalchemy.sql.select(
762 tags.columns.dataset_id,
763 tags.columns.dataset_type_id.label("type_id"),
764 tmp_tags.columns.dataset_type_id.label("new type_id"),
765 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
766 *[
767 tmp_tags.columns[dim].label(f"new {dim}")
768 for dim in self.datasetType.dimensions.required.names
769 ],
770 )
771 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id))
772 .where(
773 sqlalchemy.sql.or_(
774 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
775 *[
776 tags.columns[dim] != tmp_tags.columns[dim]
777 for dim in self.datasetType.dimensions.required.names
778 ],
779 )
780 )
781 .limit(1)
782 )
784 with self._db.query(query) as result:
785 if (row := result.first()) is not None:
786 # Only include the first one in the exception message
787 raise ConflictingDefinitionError(
788 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}"
789 )
791 # Check that matching run+dataId have the same dataset ID.
792 query = (
793 sqlalchemy.sql.select(
794 tags.columns.dataset_type_id.label("dataset_type_id"),
795 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
796 tags.columns.dataset_id,
797 tmp_tags.columns.dataset_id.label("new dataset_id"),
798 tags.columns[collFkName],
799 tmp_tags.columns[collFkName].label(f"new {collFkName}"),
800 )
801 .select_from(
802 tags.join(
803 tmp_tags,
804 sqlalchemy.sql.and_(
805 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id,
806 tags.columns[collFkName] == tmp_tags.columns[collFkName],
807 *[
808 tags.columns[dim] == tmp_tags.columns[dim]
809 for dim in self.datasetType.dimensions.required.names
810 ],
811 ),
812 )
813 )
814 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id)
815 .limit(1)
816 )
817 with self._db.query(query) as result:
818 if (row := result.first()) is not None:
819 # Only include the first one in the exception message
820 raise ConflictingDefinitionError(
821 f"Existing dataset type and dataId does not match new dataset: {row._asdict()}"
822 )
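# Each of the three queries above looks for a single counterexample (LIMIT 1)
# to one invariant of the import:
#
#     1. an existing dataset with the same dataset_id must have the same
#        dataset type and run;
#     2. an existing tags row with the same dataset_id must have the same
#        data ID;
#     3. an existing tags row with the same run, dataset type, and data ID
#        must have the same dataset_id.
#
# Any match raises ConflictingDefinitionError citing the offending row.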