Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 95%
245 statements
coverage.py v7.3.1, created at 2023-10-02 07:59 +0000
1 # This file is part of daf_butler.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This software is dual licensed under the GNU General Public License and also
10 # under a 3-clause BSD license. Recipients may choose which of these licenses
11 # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12 # respectively. If you choose the GPL option then the following text applies
13 # (but note that there is still no warranty even if you opt for BSD instead):
14 #
15 # This program is free software: you can redistribute it and/or modify
16 # it under the terms of the GNU General Public License as published by
17 # the Free Software Foundation, either version 3 of the License, or
18 # (at your option) any later version.
19 #
20 # This program is distributed in the hope that it will be useful,
21 # but WITHOUT ANY WARRANTY; without even the implied warranty of
22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 # GNU General Public License for more details.
24 #
25 # You should have received a copy of the GNU General Public License
26 # along with this program. If not, see <http://www.gnu.org/licenses/>.
29 from __future__ import annotations
31 __all__ = ("ByDimensionsDatasetRecordStorage",)
33 from collections.abc import Iterable, Iterator, Sequence, Set
34 from datetime import datetime
35 from typing import TYPE_CHECKING
37 import astropy.time
38 import sqlalchemy
39 from lsst.daf.relation import Relation, sql
41 from ....core (
42 DataCoordinate,
43 DatasetColumnTag,
44 DatasetId,
45 DatasetIdFactory,
46 DatasetIdGenEnum,
47 DatasetRef,
48 DatasetType,
49 DimensionKeyColumnTag,
50 LogicalColumn,
51 Timespan,
52 ddl,
53 )
54 from ..._collection_summary import CollectionSummary
55 from ..._collectionType import CollectionType
56 from ..._exceptions import CollectionTypeError, ConflictingDefinitionError
57 from ...interfaces import DatasetRecordStorage
58 from ...queries import SqlQueryContext
59 from .tables import makeTagTableSpec
61 if TYPE_CHECKING:
62 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
63 from .summaries import CollectionSummaryManager
64 from .tables import StaticDatasetTablesTuple
67 class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
68 """Dataset record storage implementation paired with
69 `ByDimensionsDatasetRecordStorageManagerUUID`; see that class for more
70 information.
72 Instances of this class should never be constructed directly; use
73 `DatasetRecordStorageManager.register` instead.
74 """
76 def __init__(
77 self,
78 *,
79 datasetType: DatasetType,
80 db: Database,
81 dataset_type_id: int,
82 collections: CollectionManager,
83 static: StaticDatasetTablesTuple,
84 summaries: CollectionSummaryManager,
85 tags: sqlalchemy.schema.Table,
86 use_astropy_ingest_date: bool,
87 calibs: sqlalchemy.schema.Table | None,
88 ):
89 super().__init__(datasetType=datasetType)
90 self._dataset_type_id = dataset_type_id
91 self._db = db
92 self._collections = collections
93 self._static = static
94 self._summaries = summaries
95 self._tags = tags
96 self._calibs = calibs
97 self._runKeyColumn = collections.getRunForeignKeyName()
98 self._use_astropy = use_astropy_ingest_date
100 def delete(self, datasets: Iterable[DatasetRef]) -> None:
101 # Docstring inherited from DatasetRecordStorage.
102 # Only delete from common dataset table; ON DELETE foreign key clauses
103 # will handle the rest.
104 self._db.delete(
105 self._static.dataset,
106 ["id"],
107 *[{"id": dataset.id} for dataset in datasets],
108 )
110 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
111 # Docstring inherited from DatasetRecordStorage.
112 if collection.type is not CollectionType.TAGGED:  [112 ↛ 113: condition was never true]
113 raise TypeError(
114 f"Cannot associate into collection '{collection.name}' "
115 f"of type {collection.type.name}; must be TAGGED."
116 )
117 protoRow = {
118 self._collections.getCollectionForeignKeyName(): collection.key,
119 "dataset_type_id": self._dataset_type_id,
120 }
121 rows = []
122 summary = CollectionSummary()
123 for dataset in summary.add_datasets_generator(datasets):
124 row = dict(protoRow, dataset_id=dataset.id)
125 for dimension, value in dataset.dataId.items():
126 row[dimension.name] = value
127 rows.append(row)
128 # Update the summary tables for this collection in case this is the
129 # first time this dataset type or these governor values will be
130 # inserted there.
131 self._summaries.update(collection, [self._dataset_type_id], summary)
132 # Update the tag table itself.
133 self._db.replace(self._tags, *rows)
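# A minimal sketch of the rows built by associate() above, assuming a
# hypothetical collection foreign-key column named "collection_id" (the real
# name comes from CollectionManager.getCollectionForeignKeyName()) and a
# dataset type whose data ID has "instrument" and "detector" dimensions:
#
#     protoRow = {"collection_id": 42, "dataset_type_id": 7}
#     row = dict(protoRow, dataset_id=ref.id, instrument="HSC", detector=50)
#
# One such row is produced per DatasetRef and written to the dynamic tags
# table with Database.replace().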
135 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
136 # Docstring inherited from DatasetRecordStorage.
137 if collection.type is not CollectionType.TAGGED:  [137 ↛ 138: condition was never true]
138 raise TypeError(
139 f"Cannot disassociate from collection '{collection.name}' "
140 f"of type {collection.type.name}; must be TAGGED."
141 )
142 rows = [
143 {
144 "dataset_id": dataset.id,
145 self._collections.getCollectionForeignKeyName(): collection.key,
146 }
147 for dataset in datasets
148 ]
149 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows)
151 def _buildCalibOverlapQuery(
152 self,
153 collection: CollectionRecord,
154 data_ids: set[DataCoordinate] | None,
155 timespan: Timespan,
156 context: SqlQueryContext,
157 ) -> Relation:
158 relation = self.make_relation(
159 collection, columns={"timespan", "dataset_id", "calib_pkey"}, context=context
160 ).with_rows_satisfying(
161 context.make_timespan_overlap_predicate(
162 DatasetColumnTag(self.datasetType.name, "timespan"), timespan
163 ),
164 )
165 if data_ids is not None:
166 relation = relation.join(
167 context.make_data_id_relation(
168 data_ids, self.datasetType.dimensions.required.names
169 ).transferred_to(context.sql_engine),
170 )
171 return relation
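# Usage sketch for the helper above, as consumed by certify() and decertify()
# below; the relation is processed inside the SqlQueryContext so that any
# temporary table needed for the data-ID constraint lives for the duration of
# the query:
#
#     relation = self._buildCalibOverlapQuery(collection, dataIds, timespan, context)
#     with context:
#         conflicting = context.count(context.process(relation))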
173 def certify(
174 self,
175 collection: CollectionRecord,
176 datasets: Iterable[DatasetRef],
177 timespan: Timespan,
178 context: SqlQueryContext,
179 ) -> None:
180 # Docstring inherited from DatasetRecordStorage.
181 if self._calibs is None:  [181 ↛ 182: condition was never true]
182 raise CollectionTypeError(
183 f"Cannot certify datasets of type {self.datasetType.name}, for which "
184 "DatasetType.isCalibration() is False."
185 )
186 if collection.type is not CollectionType.CALIBRATION:  [186 ↛ 187: condition was never true]
187 raise CollectionTypeError(
188 f"Cannot certify into collection '{collection.name}' "
189 f"of type {collection.type.name}; must be CALIBRATION."
190 )
191 TimespanReprClass = self._db.getTimespanRepresentation()
192 protoRow = {
193 self._collections.getCollectionForeignKeyName(): collection.key,
194 "dataset_type_id": self._dataset_type_id,
195 }
196 rows = []
197 dataIds: set[DataCoordinate] | None = (
198 set() if not TimespanReprClass.hasExclusionConstraint() else None
199 )
200 summary = CollectionSummary()
201 for dataset in summary.add_datasets_generator(datasets):
202 row = dict(protoRow, dataset_id=dataset.id)
203 for dimension, value in dataset.dataId.items():
204 row[dimension.name] = value
205 TimespanReprClass.update(timespan, result=row)
206 rows.append(row)
207 if dataIds is not None:  [207 ↛ 201: condition was never false]
208 dataIds.add(dataset.dataId)
209 # Update the summary tables for this collection in case this is the
210 # first time this dataset type or these governor values will be
211 # inserted there.
212 self._summaries.update(collection, [self._dataset_type_id], summary)
213 # Update the association table itself.
214 if TimespanReprClass.hasExclusionConstraint():  [214 ↛ 217: condition was never true]
215 # Rely on the database constraint to enforce invariants; we just
216 # re-raise it as a ConflictingDefinitionError for consistency across DB engines.
217 try:
218 self._db.insert(self._calibs, *rows)
219 except sqlalchemy.exc.IntegrityError as err:
220 raise ConflictingDefinitionError(
221 f"Validity range conflict certifying datasets of type {self.datasetType.name} "
222 f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
223 ) from err
224 else:
225 # Have to implement exclusion constraint ourselves.
226 # Start by building a SELECT query for any rows that would overlap
227 # this one.
228 relation = self._buildCalibOverlapQuery(collection, dataIds, timespan, context)
229 # Acquire a table lock to ensure there are no concurrent writes that
230 # could invalidate our checks before we finish the inserts. We
231 # use a SAVEPOINT in case there is an outer transaction that a
232 # failure here should not roll back.
233 with self._db.transaction(lock=[self._calibs], savepoint=True):
234 # Enter SqlQueryContext in case we need to use a temporary
235 # table to include the given data IDs in the query. Note that
236 # by doing this inside the transaction, we make sure it doesn't
237 # attempt to close the session when it's done, since it just
238 # sees an already-open session that it knows it shouldn't
239 # manage.
240 with context:
241 # Run the check SELECT query.
242 conflicting = context.count(context.process(relation))
243 if conflicting > 0:
244 raise ConflictingDefinitionError(
245 f"{conflicting} validity range conflicts certifying datasets of type "
246 f"{self.datasetType.name} into {collection.name} for range "
247 f"[{timespan.begin}, {timespan.end})."
248 )
249 # Proceed with the insert.
250 self._db.insert(self._calibs, *rows)
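# In summary, certify() takes one of two paths: if the database's timespan
# representation supports an exclusion constraint (e.g. range types in
# PostgreSQL), it inserts directly and lets the database reject overlapping
# validity ranges; otherwise it emulates the constraint by counting
# overlapping rows under a table lock before inserting.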
252 def decertify(
253 self,
254 collection: CollectionRecord,
255 timespan: Timespan,
256 *,
257 dataIds: Iterable[DataCoordinate] | None = None,
258 context: SqlQueryContext,
259 ) -> None:
260 # Docstring inherited from DatasetRecordStorage.
261 if self._calibs is None:  [261 ↛ 262: condition was never true]
262 raise CollectionTypeError(
263 f"Cannot decertify datasets of type {self.datasetType.name}, for which "
264 "DatasetType.isCalibration() is False."
265 )
266 if collection.type is not CollectionType.CALIBRATION:  [266 ↛ 267: condition was never true]
267 raise CollectionTypeError(
268 f"Cannot decertify from collection '{collection.name}' "
269 f"of type {collection.type.name}; must be CALIBRATION."
270 )
271 TimespanReprClass = self._db.getTimespanRepresentation()
272 # Construct a SELECT query to find all rows that overlap our inputs.
273 dataIdSet: set[DataCoordinate] | None
274 if dataIds is not None:
275 dataIdSet = set(dataIds)
276 else:
277 dataIdSet = None
278 relation = self._buildCalibOverlapQuery(collection, dataIdSet, timespan, context)
279 calib_pkey_tag = DatasetColumnTag(self.datasetType.name, "calib_pkey")
280 dataset_id_tag = DatasetColumnTag(self.datasetType.name, "dataset_id")
281 timespan_tag = DatasetColumnTag(self.datasetType.name, "timespan")
282 data_id_tags = [
283 (name, DimensionKeyColumnTag(name)) for name in self.datasetType.dimensions.required.names
284 ]
285 # Set up collections to populate with the rows we'll want to modify.
286 # The insert rows will have the same values for collection and
287 # dataset type.
288 protoInsertRow = {
289 self._collections.getCollectionForeignKeyName(): collection.key,
290 "dataset_type_id": self._dataset_type_id,
291 }
292 rowsToDelete = []
293 rowsToInsert = []
294 # Acquire a table lock to ensure there are no concurrent writes
295 # between the SELECT and the DELETE and INSERT queries based on it.
296 with self._db.transaction(lock=[self._calibs], savepoint=True):
297 # Enter SqlQueryContext in case we need to use a temporary table to
298 # include the given data IDs in the query (see similar block in
299 # certify for details).
300 with context:
301 for row in context.fetch_iterable(relation):
302 rowsToDelete.append({"id": row[calib_pkey_tag]})
303 # Construct the insert row(s) by copying the prototype row,
304 # then adding the dimension column values, then adding
305 # what's left of the timespan from that row after we
306 # subtract the given timespan.
307 newInsertRow = protoInsertRow.copy()
308 newInsertRow["dataset_id"] = row[dataset_id_tag]
309 for name, tag in data_id_tags:
310 newInsertRow[name] = row[tag]
311 rowTimespan = row[timespan_tag]
312 assert rowTimespan is not None, "Field should have a NOT NULL constraint."
313 for diffTimespan in rowTimespan.difference(timespan):
314 rowsToInsert.append(
315 TimespanReprClass.update(diffTimespan, result=newInsertRow.copy())
316 )
317 # Run the DELETE and INSERT queries.
318 self._db.delete(self._calibs, ["id"], *rowsToDelete)
319 self._db.insert(self._calibs, *rowsToInsert)
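# Illustration of the re-insert logic above: if an existing calibration row
# covers the validity range [t1, t4) and we decertify [t2, t3), then
# rowTimespan.difference(timespan) yields the remainders [t1, t2) and
# [t3, t4); each remainder becomes a new row with the same dataset ID and
# data ID, while the original row is deleted.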
321 def make_relation(
322 self,
323 *collections: CollectionRecord,
324 columns: Set[str],
325 context: SqlQueryContext,
326 ) -> Relation:
327 # Docstring inherited from DatasetRecordStorage.
328 collection_types = {collection.type for collection in collections}
329 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened."
330 TimespanReprClass = self._db.getTimespanRepresentation()
331 #
332 # There are two kinds of table in play here:
333 #
334 # - the static dataset table (with the dataset ID, dataset type ID,
335 # run ID/name, and ingest date);
336 #
337 # - the dynamic tags/calibs table (with the dataset ID, dataset
338 # type ID, collection ID/name, data ID, and possibly validity
339 # range).
340 #
341 # That means that we might want to return a query against either table
342 # or a JOIN of both, depending on which quantities the caller wants.
343 # But the data ID is always included, which means we'll always include
344 # the tags/calibs table and join in the static dataset table only if we
345 # need things from it that we can't get from the tags/calibs table.
346 #
347 # Note that it's important that we include a WHERE constraint on both
348 # tables for any column (e.g. dataset_type_id) that is in both when
349 # it's given explicitly; not doing so can prevent the query planner from
350 # using very important indexes. At present, we don't include those
351 # redundant columns in the JOIN ON expression, however, because the
352 # FOREIGN KEY (and its index) are defined only on dataset_id.
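# A rough sketch of the SQL this method produces for a single TAGGED
# collection when "run" or "ingest_date" is requested (the alias and the
# collection foreign-key column name are illustrative):
#
#     SELECT ... FROM <dataset_type>_tags AS tags
#     JOIN dataset ON tags.dataset_id = dataset.id
#     WHERE tags.dataset_type_id = :dataset_type_id
#       AND dataset.dataset_type_id = :dataset_type_id
#       AND tags.<collection_fk> = :collection_key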
353 tag_relation: Relation | None = None
354 calib_relation: Relation | None = None
355 if collection_types != {CollectionType.CALIBRATION}:
356 # We'll need a subquery for the tags table if any of the given
357 # collections are not a CALIBRATION collection. This intentionally
358 # also fires when the list of collections is empty as a way to
359 # create a dummy subquery that we know will fail.
360 # We give the table an alias because it might appear multiple times
361 # in the same query, for different dataset types.
362 tags_parts = sql.Payload[LogicalColumn](self._tags.alias(f"{self.datasetType.name}_tags"))
363 if "timespan" in columns:
364 tags_parts.columns_available[
365 DatasetColumnTag(self.datasetType.name, "timespan")
366 ] = TimespanReprClass.fromLiteral(Timespan(None, None))
367 tag_relation = self._finish_single_relation(
368 tags_parts,
369 columns,
370 [
371 (record, rank)
372 for rank, record in enumerate(collections)
373 if record.type is not CollectionType.CALIBRATION
374 ],
375 context,
376 )
377 assert "calib_pkey" not in columns, "For internal use only, and only for pure-calib queries."
378 if CollectionType.CALIBRATION in collection_types:
379 # If at least one collection is a CALIBRATION collection, we'll
380 # need a subquery for the calibs table, and could include the
381 # timespan as a result or constraint.
382 assert (
383 self._calibs is not None
384 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
385 calibs_parts = sql.Payload[LogicalColumn](self._calibs.alias(f"{self.datasetType.name}_calibs"))
386 if "timespan" in columns:
387 calibs_parts.columns_available[
388 DatasetColumnTag(self.datasetType.name, "timespan")
389 ] = TimespanReprClass.from_columns(calibs_parts.from_clause.columns)
390 if "calib_pkey" in columns:
391 # This is a private extension not included in the base class
392 # interface, for internal use only in _buildCalibOverlapQuery,
393 # which needs access to the autoincrement primary key for the
394 # calib association table.
395 calibs_parts.columns_available[
396 DatasetColumnTag(self.datasetType.name, "calib_pkey")
397 ] = calibs_parts.from_clause.columns.id
398 calib_relation = self._finish_single_relation(
399 calibs_parts,
400 columns,
401 [
402 (record, rank)
403 for rank, record in enumerate(collections)
404 if record.type is CollectionType.CALIBRATION
405 ],
406 context,
407 )
408 if tag_relation is not None:
409 if calib_relation is not None:
410 # daf_relation's chain operation does not automatically
411 # deduplicate; it's more like SQL's UNION ALL. To get UNION
412 # in SQL here, we add an explicit deduplication.
413 return tag_relation.chain(calib_relation).without_duplicates()
414 else:
415 return tag_relation
416 elif calib_relation is not None:
417 return calib_relation
418 else:
419 raise AssertionError("Branch should be unreachable.")
421 def _finish_single_relation(
422 self,
423 payload: sql.Payload[LogicalColumn],
424 requested_columns: Set[str],
425 collections: Sequence[tuple[CollectionRecord, int]],
426 context: SqlQueryContext,
427 ) -> Relation:
428 """Handle adding columns and WHERE terms that are not specific to
429 either the tags or calibs tables.
431 Helper method for `make_relation`.
433 Parameters
434 ----------
435 payload : `lsst.daf.relation.sql.Payload`
436 SQL query parts under construction, to be modified in-place and
437 used to construct the new relation.
438 requested_columns : `~collections.abc.Set` [ `str` ]
439 Columns the relation should include.
440 collections : `~collections.abc.Sequence` [ `tuple` \
441 [ `CollectionRecord`, `int` ] ]
442 Collections to search for the dataset and their ranks.
443 context : `SqlQueryContext`
444 Context that manages engines and state for the query.
446 Returns
447 -------
448 relation : `lsst.daf.relation.Relation`
449 New dataset query relation.
450 """
451 payload.where.append(payload.from_clause.columns.dataset_type_id == self._dataset_type_id)
452 dataset_id_col = payload.from_clause.columns.dataset_id
453 collection_col = payload.from_clause.columns[self._collections.getCollectionForeignKeyName()]
454 # We always constrain and optionally retrieve the collection(s) via the
455 # tags/calibs table.
456 if len(collections) == 1:
457 payload.where.append(collection_col == collections[0][0].key)
458 if "collection" in requested_columns:
459 payload.columns_available[
460 DatasetColumnTag(self.datasetType.name, "collection")
461 ] = sqlalchemy.sql.literal(collections[0][0].key)
462 else:
463 assert collections, "The no-collections case should be in calling code for better diagnostics."
464 payload.where.append(collection_col.in_([collection.key for collection, _ in collections]))
465 if "collection" in requested_columns:
466 payload.columns_available[
467 DatasetColumnTag(self.datasetType.name, "collection")
468 ] = collection_col
469 # Add rank if requested, as a CASE-based calculation on the collection
470 # column.
471 if "rank" in requested_columns:
472 payload.columns_available[DatasetColumnTag(self.datasetType.name, "rank")] = sqlalchemy.sql.case(
473 {record.key: rank for record, rank in collections},
474 value=collection_col,
475 )
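# For collections searched in order (A, B), the expression above renders
# roughly as the following SQL (keys are illustrative):
#
#     CASE <collection_fk> WHEN :key_A THEN 0 WHEN :key_B THEN 1 END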
476 # Add more column definitions, starting with the data ID.
477 for dimension_name in self.datasetType.dimensions.required.names:
478 payload.columns_available[DimensionKeyColumnTag(dimension_name)] = payload.from_clause.columns[
479 dimension_name
480 ]
481 # We can always get the dataset_id from the tags/calibs table.
482 if "dataset_id" in requested_columns:
483 payload.columns_available[DatasetColumnTag(self.datasetType.name, "dataset_id")] = dataset_id_col
484 # It's possible we now have everything we need, from just the
485 # tags/calibs table. The things we might need to get from the static
486 # dataset table are the run key and the ingest date.
487 need_static_table = False
488 if "run" in requested_columns:
489 if len(collections) == 1 and collections[0][0].type is CollectionType.RUN:
490 # If we are searching exactly one RUN collection, we
491 # know that if we find the dataset in that collection,
492 # then that's the datasets's run; we don't need to
493 # query for it.
494 payload.columns_available[
495 DatasetColumnTag(self.datasetType.name, "run")
496 ] = sqlalchemy.sql.literal(collections[0][0].key)
497 else:
498 payload.columns_available[
499 DatasetColumnTag(self.datasetType.name, "run")
500 ] = self._static.dataset.columns[self._runKeyColumn]
501 need_static_table = True
502 # Ingest date can only come from the static table.
503 if "ingest_date" in requested_columns:
504 need_static_table = True
505 payload.columns_available[
506 DatasetColumnTag(self.datasetType.name, "ingest_date")
507 ] = self._static.dataset.columns.ingest_date
508 # If we need the static table, join it in via dataset_id and
509 # dataset_type_id.
510 if need_static_table:
511 payload.from_clause = payload.from_clause.join(
512 self._static.dataset, onclause=(dataset_id_col == self._static.dataset.columns.id)
513 )
514 # Also constrain dataset_type_id in static table in case that helps
515 # generate a better plan.
516 # We could also include this in the JOIN ON clause, but my guess is
517 # that that's a good idea IFF it's in the foreign key, and right
518 # now it isn't.
519 payload.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id)
520 leaf = context.sql_engine.make_leaf(
521 payload.columns_available.keys(),
522 payload=payload,
523 name=self.datasetType.name,
524 parameters={record.name: rank for record, rank in collections},
525 )
526 return leaf
528 def getDataId(self, id: DatasetId) -> DataCoordinate:
529 """Return DataId for a dataset.
531 Parameters
532 ----------
533 id : `DatasetId`
534 Unique dataset identifier.
536 Returns
537 -------
538 dataId : `DataCoordinate`
539 DataId for the dataset.
540 """
541 # This query could return multiple rows (one for each tagged collection
542 # the dataset is in, plus one for its run collection), and we don't
543 # care which of those we get.
544 sql = (
545 self._tags.select()
546 .where(
547 sqlalchemy.sql.and_(
548 self._tags.columns.dataset_id == id,
549 self._tags.columns.dataset_type_id == self._dataset_type_id,
550 )
551 )
552 .limit(1)
553 )
554 with self._db.query(sql) as sql_result:
555 row = sql_result.mappings().fetchone()
556 assert row is not None, "Should be guaranteed by caller and foreign key constraints."
557 return DataCoordinate.standardize(
558 {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
559 graph=self.datasetType.dimensions,
560 )
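# The query above amounts to the following SQL against the dynamic tags table
# (table name and dimension columns are illustrative):
#
#     SELECT * FROM <dataset_type>_tags
#     WHERE dataset_id = :id AND dataset_type_id = :dataset_type_id
#     LIMIT 1
#
# and the required dimension values from the returned row are standardized
# into a DataCoordinate.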
563 class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
564 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for
565 dataset IDs.
566 """
568 idMaker = DatasetIdFactory()
569 """Factory for dataset IDs. In the future this factory may be shared with
570 other classes (e.g. Registry)."""
572 def insert(
573 self,
574 run: RunRecord,
575 dataIds: Iterable[DataCoordinate],
576 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
577 ) -> Iterator[DatasetRef]:
578 # Docstring inherited from DatasetRecordStorage.
580 # Current timestamp, type depends on schema version. Use microsecond
581 # precision for astropy time to keep things consistent with
582 # TIMESTAMP(6) SQL type.
583 timestamp: datetime | astropy.time.Time
584 if self._use_astropy:
585 # Astropy `now()` precision should be the same as `utcnow()` which
586 # should mean microsecond.
587 timestamp = astropy.time.Time.now()
588 else:
589 timestamp = datetime.utcnow()
591 # Iterate over data IDs, transforming a possibly-single-pass iterable
592 # into a list.
593 dataIdList = []
594 rows = []
595 summary = CollectionSummary()
596 for dataId in summary.add_data_ids_generator(self.datasetType, dataIds):
597 dataIdList.append(dataId)
598 rows.append(
599 {
600 "id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode),
601 "dataset_type_id": self._dataset_type_id,
602 self._runKeyColumn: run.key,
603 "ingest_date": timestamp,
604 }
605 )
607 with self._db.transaction():
608 # Insert into the static dataset table.
609 self._db.insert(self._static.dataset, *rows)
610 # Update the summary tables for this collection in case this is the
611 # first time this dataset type or these governor values will be
612 # inserted there.
613 self._summaries.update(run, [self._dataset_type_id], summary)
614 # Combine the generated dataset_id values and data ID fields to
615 # form rows to be inserted into the tags table.
616 protoTagsRow = {
617 "dataset_type_id": self._dataset_type_id,
618 self._collections.getCollectionForeignKeyName(): run.key,
619 }
620 tagsRows = [
621 dict(protoTagsRow, dataset_id=row["id"], **dataId.byName())
622 for dataId, row in zip(dataIdList, rows, strict=True)
623 ]
624 # Insert those rows into the tags table.
625 self._db.insert(self._tags, *tagsRows)
627 for dataId, row in zip(dataIdList, rows, strict=True):
628 yield DatasetRef(
629 datasetType=self.datasetType,
630 dataId=dataId,
631 id=row["id"],
632 run=run.name,
633 )
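# A minimal usage sketch for insert() above, assuming `storage` is an instance
# obtained from the dataset record storage manager and `run` is the target
# RunRecord (names are illustrative):
#
#     refs = list(storage.insert(run, data_ids))
#
# Each yielded DatasetRef carries a newly generated UUID and run=run.name; the
# static dataset table, the tags table, and the collection summary are all
# updated within a single transaction.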
635 def import_(
636 self,
637 run: RunRecord,
638 datasets: Iterable[DatasetRef],
639 ) -> Iterator[DatasetRef]:
640 # Docstring inherited from DatasetRecordStorage.
642 # Current timestamp, type depends on schema version.
643 if self._use_astropy:
644 # Astropy `now()` precision should be the same as `utcnow()` which
645 # should mean microsecond.
646 timestamp = sqlalchemy.sql.literal(astropy.time.Time.now(), type_=ddl.AstropyTimeNsecTai)
647 else:
648 timestamp = sqlalchemy.sql.literal(datetime.utcnow())
650 # Iterate over the incoming datasets, transforming a possibly-single-pass
651 # iterable into a mapping from dataset ID to data ID.
652 dataIds = {}
653 summary = CollectionSummary()
654 for dataset in summary.add_datasets_generator(datasets):
655 dataIds[dataset.id] = dataset.dataId
657 # We'll insert all new rows into a temporary table first.
658 tableSpec = makeTagTableSpec(self.datasetType, type(self._collections), ddl.GUID, constraints=False)
659 collFkName = self._collections.getCollectionForeignKeyName()
660 protoTagsRow = {
661 "dataset_type_id": self._dataset_type_id,
662 collFkName: run.key,
663 }
664 tmpRows = [
665 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
666 for dataset_id, dataId in dataIds.items()
667 ]
668 with self._db.transaction(for_temp_tables=True), self._db.temporary_table(tableSpec) as tmp_tags:
669 # Store all incoming data in a temporary table.
670 self._db.insert(tmp_tags, *tmpRows)
672 # There are some checks that we want to make for consistency
673 # of the new datasets with existing ones.
674 self._validateImport(tmp_tags, run)
676 # Before we merge the temporary table into dataset/tags we need to
677 # drop datasets that are already there (and do not conflict).
678 self._db.deleteWhere(
679 tmp_tags,
680 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)),
681 )
683 # Copy it into the dataset table; we need to re-label some columns.
684 self._db.insert(
685 self._static.dataset,
686 select=sqlalchemy.sql.select(
687 tmp_tags.columns.dataset_id.label("id"),
688 tmp_tags.columns.dataset_type_id,
689 tmp_tags.columns[collFkName].label(self._runKeyColumn),
690 timestamp.label("ingest_date"),
691 ),
692 )
694 # Update the summary tables for this collection in case this
695 # is the first time this dataset type or these governor values
696 # will be inserted there.
697 self._summaries.update(run, [self._dataset_type_id], summary)
699 # Copy it into the tags table.
700 self._db.insert(self._tags, select=tmp_tags.select())
702 # Return refs in the same order as in the input list.
703 for dataset_id, dataId in dataIds.items():
704 yield DatasetRef(
705 datasetType=self.datasetType,
706 id=dataset_id,
707 dataId=dataId,
708 run=run.name,
709 )
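# Summary of the merge strategy in import_() above:
#
#     1. Copy all incoming (dataset_id, collection, data ID) rows into a
#        temporary table with the same schema as the tags table.
#     2. _validateImport() cross-checks them against existing rows.
#     3. Rows whose dataset_id already exists in the static dataset table are
#        dropped from the temporary table.
#     4. The remainder is inserted (INSERT ... SELECT) into the static dataset
#        table and then into the tags table, and the collection summary is
#        updated.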
711 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None:
712 """Validate imported refs against existing datasets.
714 Parameters
715 ----------
716 tmp_tags : `sqlalchemy.schema.Table`
717 Temporary table with new datasets and the same schema as tags
718 table.
719 run : `RunRecord`
720 The record object describing the `~CollectionType.RUN` collection.
722 Raises
723 ------
724 ConflictingDefinitionError
725 Raise if new datasets conflict with existing ones.
726 """
727 dataset = self._static.dataset
728 tags = self._tags
729 collFkName = self._collections.getCollectionForeignKeyName()
731 # Check that existing datasets have the same dataset type and
732 # run.
733 query = (
734 sqlalchemy.sql.select(
735 dataset.columns.id.label("dataset_id"),
736 dataset.columns.dataset_type_id.label("dataset_type_id"),
737 tmp_tags.columns.dataset_type_id.label("new_dataset_type_id"),
738 dataset.columns[self._runKeyColumn].label("run"),
739 tmp_tags.columns[collFkName].label("new_run"),
740 )
741 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id))
742 .where(
743 sqlalchemy.sql.or_(
744 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
745 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName],
746 )
747 )
748 .limit(1)
749 )
750 with self._db.query(query) as result:
751 # Only include the first one in the exception message
752 if (row := result.first()) is not None:
753 existing_run = self._collections[row.run].name
754 new_run = self._collections[row.new_run].name
755 if row.dataset_type_id == self._dataset_type_id:
756 if row.new_dataset_type_id == self._dataset_type_id:  [756 ↛ 762: condition was never false]
757 raise ConflictingDefinitionError(
758 f"Current run {existing_run!r} and new run {new_run!r} do not agree for "
759 f"dataset {row.dataset_id}."
760 )
761 else:
762 raise ConflictingDefinitionError(
763 f"Dataset {row.dataset_id} was provided with type {self.datasetType.name!r} "
764 f"in run {new_run!r}, but was already defined with type ID {row.dataset_type_id} "
765 f"in run {run!r}."
766 )
767 else:
768 raise ConflictingDefinitionError(
769 f"Dataset {row.dataset_id} was provided with type ID {row.new_dataset_type_id} "
770 f"in run {new_run!r}, but was already defined with type {self.datasetType.name!r} "
771 f"in run {run!r}."
772 )
774 # Check that a matching dataset in the tags table has the same DataId.
775 query = (
776 sqlalchemy.sql.select(
777 tags.columns.dataset_id,
778 tags.columns.dataset_type_id.label("type_id"),
779 tmp_tags.columns.dataset_type_id.label("new_type_id"),
780 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
781 *[
782 tmp_tags.columns[dim].label(f"new_{dim}")
783 for dim in self.datasetType.dimensions.required.names
784 ],
785 )
786 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id))
787 .where(
788 sqlalchemy.sql.or_(
789 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
790 *[
791 tags.columns[dim] != tmp_tags.columns[dim]
792 for dim in self.datasetType.dimensions.required.names
793 ],
794 )
795 )
796 .limit(1)
797 )
799 with self._db.query(query) as result:
800 if (row := result.first()) is not None:
801 # Only include the first one in the exception message
802 raise ConflictingDefinitionError(
803 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}"
804 )
806 # Check that a matching run+dataId combination has the same dataset ID.
807 query = (
808 sqlalchemy.sql.select(
809 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
810 tags.columns.dataset_id,
811 tmp_tags.columns.dataset_id.label("new_dataset_id"),
812 tags.columns[collFkName],
813 tmp_tags.columns[collFkName].label(f"new_{collFkName}"),
814 )
815 .select_from(
816 tags.join(
817 tmp_tags,
818 sqlalchemy.sql.and_(
819 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id,
820 tags.columns[collFkName] == tmp_tags.columns[collFkName],
821 *[
822 tags.columns[dim] == tmp_tags.columns[dim]
823 for dim in self.datasetType.dimensions.required.names
824 ],
825 ),
826 )
827 )
828 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id)
829 .limit(1)
830 )
831 with self._db.query(query) as result:
832 # Only include the first one in the exception message.
833 if (row := result.first()) is not None:
834 data_id = {dim: getattr(row, dim) for dim in self.datasetType.dimensions.required.names}
835 existing_collection = self._collections[getattr(row, collFkName)].name
836 new_collection = self._collections[getattr(row, f"new_{collFkName}")].name
837 raise ConflictingDefinitionError(
838 f"Dataset with type {self.datasetType.name!r} and data ID {data_id} "
839 f"has ID {row.dataset_id} in existing collection {existing_collection!r} "
840 f"but ID {row.new_dataset_id} in new collection {new_collection!r}."
841 )
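# _validateImport() above performs three consistency checks, each reporting
# only the first offending row:
#
#     1. An incoming dataset ID that already exists must have the same dataset
#        type and run as the existing dataset.
#     2. An incoming dataset ID that already appears in the tags table must
#        have the same dataset type and data ID.
#     3. An incoming (run, data ID) combination already present in the tags
#        table must map to the same dataset ID.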