Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 80%
309 statements
coverage.py v6.5.0, created at 2023-01-04 02:04 -0800
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
23from __future__ import annotations
25__all__ = ("ByDimensionsDatasetRecordStorage",)
27import uuid
28from collections.abc import Iterable, Iterator, Sequence
29from typing import TYPE_CHECKING, Any
31import sqlalchemy
32from deprecated.sphinx import deprecated
34from ....core import (
35 DataCoordinate,
36 DataCoordinateSet,
37 DatasetId,
38 DatasetRef,
39 DatasetType,
40 SimpleQuery,
41 StorageClass,
42 Timespan,
43 ddl,
44)
45from ..._collection_summary import CollectionSummary
46from ..._collectionType import CollectionType
47from ..._exceptions import CollectionTypeError, ConflictingDefinitionError, UnsupportedIdGeneratorError
48from ...interfaces import DatasetIdFactory, DatasetIdGenEnum, DatasetRecordStorage
49from .tables import makeTagTableSpec
51if TYPE_CHECKING:  # 51 ↛ 52: line 51 didn't jump to line 52, because the condition on line 51 was never true
52 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
53 from .summaries import CollectionSummaryManager
54 from .tables import StaticDatasetTablesTuple
57class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
58 """Dataset record storage implementation paired with
59 `ByDimensionsDatasetRecordStorageManager`; see that class for more
60 information.
62 Instances of this class should never be constructed directly; use
63 `DatasetRecordStorageManager.register` instead.
64 """
66 def __init__(
67 self,
68 *,
69 datasetType: DatasetType,
70 db: Database,
71 dataset_type_id: int,
72 collections: CollectionManager,
73 static: StaticDatasetTablesTuple,
74 summaries: CollectionSummaryManager,
75 tags: sqlalchemy.schema.Table,
76 calibs: sqlalchemy.schema.Table | None,
77 ):
78 super().__init__(datasetType=datasetType)
79 self._dataset_type_id = dataset_type_id
80 self._db = db
81 self._collections = collections
82 self._static = static
83 self._summaries = summaries
84 self._tags = tags
85 self._calibs = calibs
86 self._runKeyColumn = collections.getRunForeignKeyName()
88 def find(
89 self,
90 collection: CollectionRecord,
91 dataId: DataCoordinate,
92 timespan: Timespan | None = None,
93 storage_class: StorageClass | str | None = None,
94 ) -> DatasetRef | None:
95 # Docstring inherited from DatasetRecordStorage.
96 assert dataId.graph == self.datasetType.dimensions
97 if collection.type is CollectionType.CALIBRATION and timespan is None:  # 97 ↛ 98: line 97 didn't jump to line 98, because the condition on line 97 was never true
98 raise TypeError(
99 f"Cannot search for dataset in CALIBRATION collection {collection.name} "
100 f"without an input timespan."
101 )
102 sql = self.select(
103 collection, dataId=dataId, id=SimpleQuery.Select, run=SimpleQuery.Select, timespan=timespan
104 )
105 with self._db.query(sql) as results:
106 row = results.fetchone()
107 if row is None:
108 return None
109 if collection.type is CollectionType.CALIBRATION:
110 # For temporal calibration lookups (only!) our invariants do
111 # not guarantee that the number of result rows is <= 1. They
112 # would if `select` constrained the given timespan to be
113 # _contained_ by the validity range in the self._calibs table,
114 # instead of simply _overlapping_ it, because we do guarantee
115 # that the validity ranges are disjoint for a particular
116 # dataset type, collection, and data ID. But using an overlap
117 # test and a check for multiple result rows here allows us to
118 # provide a more useful diagnostic, as well as allowing
119 # `select` to support more general queries where multiple
120 # results are not an error.
121 if results.fetchone() is not None:
122 raise RuntimeError(
123 f"Multiple matches found for calibration lookup in {collection.name} for "
124 f"{self.datasetType.name} with {dataId} overlapping {timespan}. "
125 )
126 datasetType = self.datasetType
127 if storage_class is not None:
128 datasetType = datasetType.overrideStorageClass(storage_class)
129 return DatasetRef(
130 datasetType=datasetType,
131 dataId=dataId,
132 id=row.id,
133 run=self._collections[row._mapping[self._runKeyColumn]].name,
134 )
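# A minimal usage sketch, assuming `storage` was obtained via
# `DatasetRecordStorageManager.register` (see the class docstring) and that
# `run_record`, `calib_record`, `data_id`, and `ts` are hypothetical inputs:
#
#     ref = storage.find(run_record, data_id)
#     calib_ref = storage.find(calib_record, data_id, timespan=ts)
#     if ref is not None:
#         print(ref.id, ref.run)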
136 def delete(self, datasets: Iterable[DatasetRef]) -> None:
137 # Docstring inherited from DatasetRecordStorage.
138 # Only delete from common dataset table; ON DELETE foreign key clauses
139 # will handle the rest.
140 self._db.delete(
141 self._static.dataset,
142 ["id"],
143 *[{"id": dataset.getCheckedId()} for dataset in datasets],
144 )
146 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
147 # Docstring inherited from DatasetRecordStorage.
148 if collection.type is not CollectionType.TAGGED:  # 148 ↛ 149: line 148 didn't jump to line 149, because the condition on line 148 was never true
149 raise TypeError(
150 f"Cannot associate into collection '{collection.name}' "
151 f"of type {collection.type.name}; must be TAGGED."
152 )
153 protoRow = {
154 self._collections.getCollectionForeignKeyName(): collection.key,
155 "dataset_type_id": self._dataset_type_id,
156 }
157 rows = []
158 summary = CollectionSummary()
159 for dataset in summary.add_datasets_generator(datasets):
160 row = dict(protoRow, dataset_id=dataset.getCheckedId())
161 for dimension, value in dataset.dataId.items():
162 row[dimension.name] = value
163 rows.append(row)
164 # Update the summary tables for this collection in case this is the
165 # first time this dataset type or these governor values will be
166 # inserted there.
167 self._summaries.update(collection, [self._dataset_type_id], summary)
168 # Update the tag table itself.
169 self._db.replace(self._tags, *rows)
171 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
172 # Docstring inherited from DatasetRecordStorage.
173 if collection.type is not CollectionType.TAGGED:  # 173 ↛ 174: line 173 didn't jump to line 174, because the condition on line 173 was never true
174 raise TypeError(
175 f"Cannot disassociate from collection '{collection.name}' "
176 f"of type {collection.type.name}; must be TAGGED."
177 )
178 rows = [
179 {
180 "dataset_id": dataset.getCheckedId(),
181 self._collections.getCollectionForeignKeyName(): collection.key,
182 }
183 for dataset in datasets
184 ]
185 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows)
187 def _buildCalibOverlapQuery(
188 self, collection: CollectionRecord, dataIds: DataCoordinateSet | None, timespan: Timespan
189 ) -> SimpleQuery:
190 assert self._calibs is not None
191 # Start by building a SELECT query for any rows that would overlap
192 # this one.
193 query = SimpleQuery()
194 query.join(self._calibs)
195 # Add a WHERE clause matching the dataset type and collection.
196 query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id)
197 query.where.append(
198 self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key
199 )
200 # Add a WHERE clause matching any of the given data IDs.
201 if dataIds is not None:
202 dataIds.constrain(
203 query,
204 lambda name: self._calibs.columns[name], # type: ignore
205 )
206 # Add WHERE clause for timespan overlaps.
207 TimespanReprClass = self._db.getTimespanRepresentation()
208 query.where.append(
209 TimespanReprClass.from_columns(self._calibs.columns).overlaps(
210 TimespanReprClass.fromLiteral(timespan)
211 )
212 )
213 return query
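# The query built above corresponds roughly to the following SQL (column and
# parameter names are approximate and depend on the schema and on the
# timespan representation in use):
#
#     SELECT ... FROM <calibs table>
#     WHERE dataset_type_id = :dataset_type_id
#       AND <collection foreign key> = :collection_key
#       [AND <data ID columns> match the given data IDs]
#       AND <validity range columns> overlap :timespan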
215 def certify(
216 self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan
217 ) -> None:
218 # Docstring inherited from DatasetRecordStorage.
219 if self._calibs is None:  # 219 ↛ 220: line 219 didn't jump to line 220, because the condition on line 219 was never true
220 raise CollectionTypeError(
221 f"Cannot certify datasets of type {self.datasetType.name}, for which "
222 f"DatasetType.isCalibration() is False."
223 )
224 if collection.type is not CollectionType.CALIBRATION:  # 224 ↛ 225: line 224 didn't jump to line 225, because the condition on line 224 was never true
225 raise CollectionTypeError(
226 f"Cannot certify into collection '{collection.name}' "
227 f"of type {collection.type.name}; must be CALIBRATION."
228 )
229 TimespanReprClass = self._db.getTimespanRepresentation()
230 protoRow = {
231 self._collections.getCollectionForeignKeyName(): collection.key,
232 "dataset_type_id": self._dataset_type_id,
233 }
234 rows = []
235 dataIds: set[DataCoordinate] | None = (
236 set() if not TimespanReprClass.hasExclusionConstraint() else None
237 )
238 summary = CollectionSummary()
239 for dataset in summary.add_datasets_generator(datasets):
240 row = dict(protoRow, dataset_id=dataset.getCheckedId())
241 for dimension, value in dataset.dataId.items():
242 row[dimension.name] = value
243 TimespanReprClass.update(timespan, result=row)
244 rows.append(row)
245 if dataIds is not None:  # 245 ↛ 239: line 245 didn't jump to line 239, because the condition on line 245 was never false
246 dataIds.add(dataset.dataId)
247 # Update the summary tables for this collection in case this is the
248 # first time this dataset type or these governor values will be
249 # inserted there.
250 self._summaries.update(collection, [self._dataset_type_id], summary)
251 # Update the association table itself.
252 if TimespanReprClass.hasExclusionConstraint():  # 252 ↛ 255: line 252 didn't jump to line 255, because the condition on line 252 was never true
253 # Rely on database constraint to enforce invariants; we just
254 # reraise the exception for consistency across DB engines.
255 try:
256 self._db.insert(self._calibs, *rows)
257 except sqlalchemy.exc.IntegrityError as err:
258 raise ConflictingDefinitionError(
259 f"Validity range conflict certifying datasets of type {self.datasetType.name} "
260 f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
261 ) from err
262 else:
263 # Have to implement exclusion constraint ourselves.
264 # Start by building a SELECT query for any rows that would overlap
265 # this one.
266 query = self._buildCalibOverlapQuery(
267 collection,
268 DataCoordinateSet(dataIds, graph=self.datasetType.dimensions), # type: ignore
269 timespan,
270 )
271 query.columns.append(sqlalchemy.sql.func.count())
272 sql = query.combine()
273 # Acquire a table lock to ensure that no concurrent writes can
274 # invalidate our check before we finish the inserts. We
275 # use a SAVEPOINT in case there is an outer transaction that a
276 # failure here should not roll back.
277 with self._db.transaction(lock=[self._calibs], savepoint=True):
278 # Run the check SELECT query.
279 with self._db.query(sql) as sql_result:
280 conflicting = sql_result.scalar()
281 if conflicting > 0:
282 raise ConflictingDefinitionError(
283 f"{conflicting} validity range conflicts certifying datasets of type "
284 f"{self.datasetType.name} into {collection.name} for range "
285 f"[{timespan.begin}, {timespan.end})."
286 )
287 # Proceed with the insert.
288 self._db.insert(self._calibs, *rows)
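# A worked example of the overlap invariant enforced above: if a dataset with
# the same data ID is already certified into `collection` with validity range
# [2020-01-01, 2020-06-01), then certifying another dataset over
# [2020-03-01, 2020-09-01) overlaps it and raises ConflictingDefinitionError,
# while certifying over [2020-06-01, 2020-09-01) does not (ranges are
# half-open, so touching at an endpoint is not an overlap).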
290 def decertify(
291 self,
292 collection: CollectionRecord,
293 timespan: Timespan,
294 *,
295 dataIds: Iterable[DataCoordinate] | None = None,
296 ) -> None:
297 # Docstring inherited from DatasetRecordStorage.
298 if self._calibs is None:  # 298 ↛ 299: line 298 didn't jump to line 299, because the condition on line 298 was never true
299 raise CollectionTypeError(
300 f"Cannot decertify datasets of type {self.datasetType.name}, for which "
301 f"DatasetType.isCalibration() is False."
302 )
303 if collection.type is not CollectionType.CALIBRATION:  # 303 ↛ 304: line 303 didn't jump to line 304, because the condition on line 303 was never true
304 raise CollectionTypeError(
305 f"Cannot decertify from collection '{collection.name}' "
306 f"of type {collection.type.name}; must be CALIBRATION."
307 )
308 TimespanReprClass = self._db.getTimespanRepresentation()
309 # Construct a SELECT query to find all rows that overlap our inputs.
310 dataIdSet: DataCoordinateSet | None
311 if dataIds is not None:
312 dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions)
313 else:
314 dataIdSet = None
315 query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan)
316 query.columns.extend(self._calibs.columns)
317 sql = query.combine()
318 # Set up collections to populate with the rows we'll want to modify.
319 # The insert rows will have the same values for collection and
320 # dataset type.
321 protoInsertRow = {
322 self._collections.getCollectionForeignKeyName(): collection.key,
323 "dataset_type_id": self._dataset_type_id,
324 }
325 rowsToDelete = []
326 rowsToInsert = []
327 # Acquire a table lock to ensure there are no concurrent writes
328 # between the SELECT and the DELETE and INSERT queries based on it.
329 with self._db.transaction(lock=[self._calibs], savepoint=True):
330 with self._db.query(sql) as sql_result:
331 sql_rows = sql_result.mappings().fetchall()
332 for row in sql_rows:
333 rowsToDelete.append({"id": row["id"]})
334 # Construct the insert row(s) by copying the prototype row,
335 # then adding the dimension column values, then adding what's
336 # left of the timespan from that row after we subtract the
337 # given timespan.
338 newInsertRow = protoInsertRow.copy()
339 newInsertRow["dataset_id"] = row["dataset_id"]
340 for name in self.datasetType.dimensions.required.names:
341 newInsertRow[name] = row[name]
342 rowTimespan = TimespanReprClass.extract(row)
343 assert rowTimespan is not None, "Field should have a NOT NULL constraint."
344 for diffTimespan in rowTimespan.difference(timespan):
345 rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()))
346 # Run the DELETE and INSERT queries.
347 self._db.delete(self._calibs, ["id"], *rowsToDelete)
348 self._db.insert(self._calibs, *rowsToInsert)
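# A worked example of the splitting logic above: decertifying [t1, t3) from a
# row whose validity range is [t0, t4) deletes that row and, via
# `rowTimespan.difference(timespan)`, re-inserts two replacement rows covering
# [t0, t1) and [t3, t4); if the decertified range covers the whole row, no
# replacement rows are inserted.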
350 def select(
351 self,
352 *collections: CollectionRecord,
353 dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
354 id: SimpleQuery.Select.Or[int | None] = SimpleQuery.Select,
355 run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
356 timespan: SimpleQuery.Select.Or[Timespan | None] = SimpleQuery.Select,
357 ingestDate: SimpleQuery.Select.Or[Timespan | None] = None,
358 rank: SimpleQuery.Select.Or[None] = None,
359 ) -> sqlalchemy.sql.Selectable:
360 # Docstring inherited from DatasetRecordStorage.
361 collection_types = {collection.type for collection in collections}
362 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened."
363 TimespanReprClass = self._db.getTimespanRepresentation()
364 #
365 # There are two kinds of table in play here:
366 #
367 # - the static dataset table (with the dataset ID, dataset type ID,
368 # run ID/name, and ingest date);
369 #
370 # - the dynamic tags/calibs table (with the dataset ID, dataset
371 # type ID, collection ID/name, data ID, and possibly validity
372 # range).
373 #
374 # That means that we might want to return a query against either table
375 # or a JOIN of both, depending on which quantities the caller wants.
376 # But this method is documented/typed such that ``dataId`` is never
377 # `None` - i.e. we always constrain or retrieve the data ID. That
378 # means we'll always include the tags/calibs table and join in the
379 # static dataset table only if we need things from it that we can't get
380 # from the tags/calibs table.
381 #
382 # Note that it's important that we include a WHERE constraint on both
383 # tables for any column (e.g. dataset_type_id) that is in both when
384 # it's given explicitly; not doing so can prevent the query planner from
385 # using very important indexes. At present, we don't include those
386 # redundant columns in the JOIN ON expression, however, because the
387 # FOREIGN KEY (and its index) are defined only on dataset_id.
388 #
389 # We'll start by accumulating kwargs to pass to SimpleQuery.join when
390 # we bring in the tags/calibs table. We get the data ID or constrain
391 # it in the tags/calibs table(s), but that's multiple columns, not one,
392 # so we need to transform the one Select.Or argument into a dictionary
393 # of them.
394 kwargs: dict[str, Any]
395 if dataId is SimpleQuery.Select:
396 kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required}
397 else:
398 kwargs = dict(dataId.byName())
399 # We always constrain (never retrieve) the dataset type in at least the
400 # tags/calibs table.
401 kwargs["dataset_type_id"] = self._dataset_type_id
402 # Join in the tags and/or calibs tables, turning those 'kwargs' entries
403 # into WHERE constraints or SELECT columns as appropriate.
404 if collection_types != {CollectionType.CALIBRATION}:
405 # We'll need a subquery for the tags table if any of the given
406 # collections are not a CALIBRATION collection. This intentionally
407 # also fires when the list of collections is empty as a way to
408 # create a dummy subquery that we know will fail.
409 tags_query = SimpleQuery()
410 tags_query.join(self._tags, **kwargs)
411 # If the timespan is requested, simulate a potentially compound
412 # column whose values are the maximum and minimum timespan
413 # bounds.
414 # If the timespan is constrained, ignore the constraint, since
415 # it'd be guaranteed to evaluate to True.
416 if timespan is SimpleQuery.Select:
417 tags_query.columns.extend(TimespanReprClass.fromLiteral(Timespan(None, None)).flatten())
418 self._finish_single_select(
419 tags_query,
420 self._tags,
421 collections,
422 id=id,
423 run=run,
424 ingestDate=ingestDate,
425 rank=rank,
426 )
427 else:
428 tags_query = None
429 if CollectionType.CALIBRATION in collection_types:
430 # If at least one collection is a CALIBRATION collection, we'll
431 # need a subquery for the calibs table, and could include the
432 # timespan as a result or constraint.
433 calibs_query = SimpleQuery()
434 assert (
435 self._calibs is not None
436 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
437 calibs_query.join(self._calibs, **kwargs)
438 # Add the timespan column(s) to the result columns, or constrain
439 # the timespan via an overlap condition.
440 if timespan is SimpleQuery.Select:
441 calibs_query.columns.extend(TimespanReprClass.from_columns(self._calibs.columns).flatten())
442 elif timespan is not None:
443 calibs_query.where.append(
444 TimespanReprClass.from_columns(self._calibs.columns).overlaps(
445 TimespanReprClass.fromLiteral(timespan)
446 )
447 )
448 self._finish_single_select(
449 calibs_query,
450 self._calibs,
451 collections,
452 id=id,
453 run=run,
454 ingestDate=ingestDate,
455 rank=rank,
456 )
457 else:
458 calibs_query = None
459 if calibs_query is not None:
460 if tags_query is not None:
461 return tags_query.combine().union(calibs_query.combine())
462 else:
463 return calibs_query.combine()
464 else:
465 assert tags_query is not None, "Earlier logic should guarantee at least one is not None."
466 return tags_query.combine()
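# A typical call, mirroring `find` above, selects the dataset ID and run while
# constraining the data ID (hypothetical `collection_record`, `data_id`, `ts`):
#
#     sql = storage.select(
#         collection_record, dataId=data_id, id=SimpleQuery.Select,
#         run=SimpleQuery.Select, timespan=ts,
#     )
#
# The returned selectable is a tags-table subquery, a calibs-table subquery,
# or a UNION of the two, depending on the types of the given collections.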
468 def _finish_single_select(
469 self,
470 query: SimpleQuery,
471 table: sqlalchemy.schema.Table,
472 collections: Sequence[CollectionRecord],
473 id: SimpleQuery.Select.Or[int | None],
474 run: SimpleQuery.Select.Or[None],
475 ingestDate: SimpleQuery.Select.Or[Timespan | None],
476 rank: SimpleQuery.Select.Or[None],
477 ) -> None:
478 dataset_id_col = table.columns.dataset_id
479 collection_col = table.columns[self._collections.getCollectionForeignKeyName()]
480 # We always constrain (never retrieve) the collection(s) in the
481 # tags/calibs table.
482 if len(collections) == 1:
483 query.where.append(collection_col == collections[0].key)
484 elif len(collections) == 0:
485 # We support the case where there are no collections as a way to
486 # generate a valid SQL query that can't yield results. This should
487 # never get executed, but lots of downstream code will still try
488 # to access the SQLAlchemy objects representing the columns in the
489 # subquery. That's not ideal, but it'd take a lot of refactoring
490 # to fix it (DM-31725).
491 query.where.append(sqlalchemy.sql.literal(False))
492 else:
493 query.where.append(collection_col.in_([collection.key for collection in collections]))
494 # Add rank, if requested, as a CASE-based calculation on the
495 # collection column.
496 if rank is not None:
497 assert rank is SimpleQuery.Select, "Cannot constrain rank, only select it."
498 query.columns.append(
499 sqlalchemy.sql.case(
500 {record.key: n for n, record in enumerate(collections)},
501 value=collection_col,
502 ).label("rank")
503 )
504 # We can always get the dataset_id from the tags/calibs table or
505 # constrain it there. Can't use kwargs for that because we need to
506 # alias it to 'id'.
507 if id is SimpleQuery.Select:
508 query.columns.append(dataset_id_col.label("id"))
509 elif id is not None:  # 509 ↛ 510: line 509 didn't jump to line 510, because the condition on line 509 was never true
510 query.where.append(dataset_id_col == id)
511 # It's possible we now have everything we need, from just the
512 # tags/calibs table. The things we might need to get from the static
513 # dataset table are the run key and the ingest date.
514 need_static_table = False
515 static_kwargs: dict[str, Any] = {}
516 if run is not None:
517 assert run is SimpleQuery.Select, "To constrain the run name, pass a RunRecord as a collection."
518 if len(collections) == 1 and collections[0].type is CollectionType.RUN:
519 # If we are searching exactly one RUN collection, we
520 # know that if we find the dataset in that collection,
521 # then that's the dataset's run; we don't need to
522 # query for it.
523 query.columns.append(sqlalchemy.sql.literal(collections[0].key).label(self._runKeyColumn))
524 else:
525 static_kwargs[self._runKeyColumn] = SimpleQuery.Select
526 need_static_table = True
527 # Ingest date can only come from the static table.
528 if ingestDate is not None:
529 need_static_table = True
530 if ingestDate is SimpleQuery.Select:  # 530 ↛ 533: line 530 didn't jump to line 533, because the condition on line 530 was never false
531 static_kwargs["ingest_date"] = SimpleQuery.Select
532 else:
533 assert isinstance(ingestDate, Timespan)
534 # Timespan bounds are astropy Time (usually in TAI) and ingest_date is
535 # TIMESTAMP; convert values to Python datetime for sqlalchemy.
536 if ingestDate.isEmpty():
537 raise RuntimeError("Empty timespan constraint provided for ingest_date.")
538 if ingestDate.begin is not None:
539 begin = ingestDate.begin.utc.datetime # type: ignore
540 query.where.append(self._static.dataset.columns.ingest_date >= begin)
541 if ingestDate.end is not None:
542 end = ingestDate.end.utc.datetime # type: ignore
543 query.where.append(self._static.dataset.columns.ingest_date < end)
544 # If we need the static table, join it in via dataset_id and
545 # dataset_type_id
546 if need_static_table:
547 query.join(
548 self._static.dataset,
549 onclause=(dataset_id_col == self._static.dataset.columns.id),
550 **static_kwargs,
551 )
552 # Also constrain dataset_type_id in static table in case that helps
553 # generate a better plan.
554 # We could also include this in the JOIN ON clause, but my guess is
555 # that that's a good idea IFF it's in the foreign key, and right
556 # now it isn't.
557 query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id)
559 def getDataId(self, id: DatasetId) -> DataCoordinate:
560 """Return DataId for a dataset.
562 Parameters
563 ----------
564 id : `DatasetId`
565 Unique dataset identifier.
567 Returns
568 -------
569 dataId : `DataCoordinate`
570 DataId for the dataset.
571 """
572 # This query could return multiple rows (one for each tagged collection
573 # the dataset is in, plus one for its run collection), and we don't
574 # care which of those we get.
575 sql = (
576 self._tags.select()
577 .where(
578 sqlalchemy.sql.and_(
579 self._tags.columns.dataset_id == id,
580 self._tags.columns.dataset_type_id == self._dataset_type_id,
581 )
582 )
583 .limit(1)
584 )
585 with self._db.query(sql) as sql_result:
586 row = sql_result.mappings().fetchone()
587 assert row is not None, "Should be guaranteed by caller and foreign key constraints."
588 return DataCoordinate.standardize(
589 {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
590 graph=self.datasetType.dimensions,
591 )
594@deprecated(
595 "Integer dataset IDs are deprecated in favor of UUIDs; support will be removed after v26. "
596 "Please migrate or re-create this data repository.",
597 version="v25.0",
598 category=FutureWarning,
599)
600class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage):
601 """Implementation of ByDimensionsDatasetRecordStorage which uses an
602 integer auto-incremented column for dataset IDs.
603 """
605 def insert(
606 self,
607 run: RunRecord,
608 dataIds: Iterable[DataCoordinate],
609 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
610 ) -> Iterator[DatasetRef]:
611 # Docstring inherited from DatasetRecordStorage.
613 # We only support UNIQUE mode for integer dataset IDs
614 if idMode != DatasetIdGenEnum.UNIQUE:
615 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.")
617 # Transform a possibly-single-pass iterable into a list.
618 dataIdList = list(dataIds)
619 yield from self._insert(run, dataIdList)
621 def import_(
622 self,
623 run: RunRecord,
624 datasets: Iterable[DatasetRef],
625 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
626 reuseIds: bool = False,
627 ) -> Iterator[DatasetRef]:
628 # Docstring inherited from DatasetRecordStorage.
630 # We only support UNIQUE mode for integer dataset IDs
631 if idGenerationMode != DatasetIdGenEnum.UNIQUE:
632 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.")
634 # Make a list of dataIds and optionally dataset IDs.
635 dataIdList: list[DataCoordinate] = []
636 datasetIdList: list[int] = []
637 for dataset in datasets:
638 dataIdList.append(dataset.dataId)
640 # We only accept integer dataset IDs, but also allow None.
641 datasetId = dataset.id
642 if datasetId is None:
643 # if reuseIds is set then all IDs must be known
644 if reuseIds:
645 raise TypeError("All dataset IDs must be known if `reuseIds` is set")
646 elif isinstance(datasetId, int):
647 if reuseIds:
648 datasetIdList.append(datasetId)
649 else:
650 raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}")
652 yield from self._insert(run, dataIdList, datasetIdList)
654 def _insert(
655 self, run: RunRecord, dataIdList: list[DataCoordinate], datasetIdList: list[int] | None = None
656 ) -> Iterator[DatasetRef]:
657 """Common part of implementation of `insert` and `import_` methods."""
659 # Remember any governor dimension values we see.
660 summary = CollectionSummary()
661 summary.add_data_ids(self.datasetType, dataIdList)
663 staticRow = {
664 "dataset_type_id": self._dataset_type_id,
665 self._runKeyColumn: run.key,
666 }
667 with self._db.transaction():
668 # Insert into the static dataset table, generating autoincrement
669 # dataset_id values.
670 if datasetIdList:
671 # reuse existing IDs
672 rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList]
673 self._db.insert(self._static.dataset, *rows)
674 else:
675 # use auto-incremented IDs
676 datasetIdList = self._db.insert(
677 self._static.dataset, *([staticRow] * len(dataIdList)), returnIds=True
678 )
679 assert datasetIdList is not None
680 # Update the summary tables for this collection in case this is the
681 # first time this dataset type or these governor values will be
682 # inserted there.
683 self._summaries.update(run, [self._dataset_type_id], summary)
684 # Combine the generated dataset_id values and data ID fields to
685 # form rows to be inserted into the tags table.
686 protoTagsRow = {
687 "dataset_type_id": self._dataset_type_id,
688 self._collections.getCollectionForeignKeyName(): run.key,
689 }
690 tagsRows = [
691 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
692 for dataId, dataset_id in zip(dataIdList, datasetIdList)
693 ]
694 # Insert those rows into the tags table. This is where we'll
695 # get any unique constraint violations.
696 self._db.insert(self._tags, *tagsRows)
698 for dataId, datasetId in zip(dataIdList, datasetIdList):
699 yield DatasetRef(
700 datasetType=self.datasetType,
701 dataId=dataId,
702 id=datasetId,
703 run=run.name,
704 )
707class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
708 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for
709 dataset IDs.
710 """
712 idMaker = DatasetIdFactory()
713 """Factory for dataset IDs. In the future this factory may be shared with
714 other classes (e.g. Registry)."""
716 def insert(
717 self,
718 run: RunRecord,
719 dataIds: Iterable[DataCoordinate],
720 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
721 ) -> Iterator[DatasetRef]:
722 # Docstring inherited from DatasetRecordStorage.
724 # Iterate over data IDs, transforming a possibly-single-pass iterable
725 # into a list.
726 dataIdList = []
727 rows = []
728 summary = CollectionSummary()
729 for dataId in summary.add_data_ids_generator(self.datasetType, dataIds):
730 dataIdList.append(dataId)
731 rows.append(
732 {
733 "id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode),
734 "dataset_type_id": self._dataset_type_id,
735 self._runKeyColumn: run.key,
736 }
737 )
739 with self._db.transaction():
740 # Insert into the static dataset table.
741 self._db.insert(self._static.dataset, *rows)
742 # Update the summary tables for this collection in case this is the
743 # first time this dataset type or these governor values will be
744 # inserted there.
745 self._summaries.update(run, [self._dataset_type_id], summary)
746 # Combine the generated dataset_id values and data ID fields to
747 # form rows to be inserted into the tags table.
748 protoTagsRow = {
749 "dataset_type_id": self._dataset_type_id,
750 self._collections.getCollectionForeignKeyName(): run.key,
751 }
752 tagsRows = [
753 dict(protoTagsRow, dataset_id=row["id"], **dataId.byName())
754 for dataId, row in zip(dataIdList, rows)
755 ]
756 # Insert those rows into the tags table.
757 self._db.insert(self._tags, *tagsRows)
759 for dataId, row in zip(dataIdList, rows):
760 yield DatasetRef(
761 datasetType=self.datasetType,
762 dataId=dataId,
763 id=row["id"],
764 run=run.name,
765 )
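# A short usage sketch for the method above (hypothetical `run_record` and
# `data_ids`); each yielded DatasetRef carries the newly generated UUID and
# the run name:
#
#     refs = list(storage.insert(run_record, data_ids))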
767 def import_(
768 self,
769 run: RunRecord,
770 datasets: Iterable[DatasetRef],
771 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
772 reuseIds: bool = False,
773 ) -> Iterator[DatasetRef]:
774 # Docstring inherited from DatasetRecordStorage.
776 # Iterate over data IDs, transforming a possibly-single-pass iterable
777 # into a list.
778 dataIds = {}
779 summary = CollectionSummary()
780 for dataset in summary.add_datasets_generator(datasets):
781 # Ignore unknown ID types; normally all IDs have the same type, but
782 # this code supports mixed types or missing IDs.
783 datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None
784 if datasetId is None:
785 datasetId = self.idMaker.makeDatasetId(
786 run.name, self.datasetType, dataset.dataId, idGenerationMode
787 )
788 dataIds[datasetId] = dataset.dataId
790 # We'll insert all new rows into a temporary table
791 tableSpec = makeTagTableSpec(self.datasetType, type(self._collections), ddl.GUID, constraints=False)
792 collFkName = self._collections.getCollectionForeignKeyName()
793 protoTagsRow = {
794 "dataset_type_id": self._dataset_type_id,
795 collFkName: run.key,
796 }
797 tmpRows = [
798 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
799 for dataset_id, dataId in dataIds.items()
800 ]
801 with self._db.transaction(for_temp_tables=True):
802 with self._db.temporary_table(tableSpec) as tmp_tags:
803 # store all incoming data in a temporary table
804 self._db.insert(tmp_tags, *tmpRows)
806 # There are some checks that we want to make for consistency
807 # of the new datasets with existing ones.
808 self._validateImport(tmp_tags, run)
810 # Before we merge the temporary table into dataset/tags we need to
811 # drop datasets that are already there (and do not conflict).
812 self._db.deleteWhere(
813 tmp_tags,
814 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)),
815 )
817 # Copy it into the dataset table; we need to re-label some columns.
818 self._db.insert(
819 self._static.dataset,
820 select=sqlalchemy.sql.select(
821 tmp_tags.columns.dataset_id.label("id"),
822 tmp_tags.columns.dataset_type_id,
823 tmp_tags.columns[collFkName].label(self._runKeyColumn),
824 ),
825 )
827 # Update the summary tables for this collection in case this
828 # is the first time this dataset type or these governor values
829 # will be inserted there.
830 self._summaries.update(run, [self._dataset_type_id], summary)
832 # Copy it into tags table.
833 self._db.insert(self._tags, select=tmp_tags.select())
835 # Return refs in the same order as in the input list.
836 for dataset_id, dataId in dataIds.items():
837 yield DatasetRef(
838 datasetType=self.datasetType,
839 id=dataset_id,
840 dataId=dataId,
841 run=run.name,
842 )
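# Summary of the flow above: (1) stage all incoming rows in a temporary
# tags-shaped table, (2) run the consistency checks in `_validateImport`,
# (3) drop staged rows whose dataset_id already exists in the static dataset
# table, and (4) insert the remainder into the dataset table and copy it into
# the tags table, so re-importing datasets that already exist (and do not
# conflict) leaves them untouched.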
844 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None:
845 """Validate imported refs against existing datasets.
847 Parameters
848 ----------
849 tmp_tags : `sqlalchemy.schema.Table`
850 Temporary table with new datasets and the same schema as tags
851 table.
852 run : `RunRecord`
853 The record object describing the `~CollectionType.RUN` collection.
855 Raises
856 ------
857 ConflictingDefinitionError
858 Raised if new datasets conflict with existing ones.
859 """
860 dataset = self._static.dataset
861 tags = self._tags
862 collFkName = self._collections.getCollectionForeignKeyName()
864 # Check that existing datasets have the same dataset type and
865 # run.
866 query = (
867 sqlalchemy.sql.select(
868 dataset.columns.id.label("dataset_id"),
869 dataset.columns.dataset_type_id.label("dataset_type_id"),
870 tmp_tags.columns.dataset_type_id.label("new dataset_type_id"),
871 dataset.columns[self._runKeyColumn].label("run"),
872 tmp_tags.columns[collFkName].label("new run"),
873 )
874 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id))
875 .where(
876 sqlalchemy.sql.or_(
877 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
878 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName],
879 )
880 )
881 .limit(1)
882 )
883 with self._db.query(query) as result:
884 if (row := result.first()) is not None:
885 # Only include the first one in the exception message
886 raise ConflictingDefinitionError(
887 f"Existing dataset type or run does not match new dataset: {row._asdict()}"
888 )
890 # Check that a matching dataset in the tags table has the same DataId.
891 query = (
892 sqlalchemy.sql.select(
893 tags.columns.dataset_id,
894 tags.columns.dataset_type_id.label("type_id"),
895 tmp_tags.columns.dataset_type_id.label("new type_id"),
896 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
897 *[
898 tmp_tags.columns[dim].label(f"new {dim}")
899 for dim in self.datasetType.dimensions.required.names
900 ],
901 )
902 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id))
903 .where(
904 sqlalchemy.sql.or_(
905 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
906 *[
907 tags.columns[dim] != tmp_tags.columns[dim]
908 for dim in self.datasetType.dimensions.required.names
909 ],
910 )
911 )
912 .limit(1)
913 )
915 with self._db.query(query) as result:
916 if (row := result.first()) is not None:
917 # Only include the first one in the exception message
918 raise ConflictingDefinitionError(
919 f"Existing dataset type or dataId does not match new dataset: {row._asdict()}"
920 )
922 # Check that matching run+dataId have the same dataset ID.
923 query = (
924 sqlalchemy.sql.select(
925 tags.columns.dataset_type_id.label("dataset_type_id"),
926 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
927 tags.columns.dataset_id,
928 tmp_tags.columns.dataset_id.label("new dataset_id"),
929 tags.columns[collFkName],
930 tmp_tags.columns[collFkName].label(f"new {collFkName}"),
931 )
932 .select_from(
933 tags.join(
934 tmp_tags,
935 sqlalchemy.sql.and_(
936 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id,
937 tags.columns[collFkName] == tmp_tags.columns[collFkName],
938 *[
939 tags.columns[dim] == tmp_tags.columns[dim]
940 for dim in self.datasetType.dimensions.required.names
941 ],
942 ),
943 )
944 )
945 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id)
946 .limit(1)
947 )
948 with self._db.query(query) as result:
949 if (row := result.first()) is not None:
950 # only include the first one in the exception message
951 raise ConflictingDefinitionError(
952 f"Existing dataset type and dataId do not match new dataset: {row._asdict()}"
953 )