Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 79%
306 statements
coverage.py v7.2.5, created at 2023-05-15 00:09 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
23from __future__ import annotations
25__all__ = ("ByDimensionsDatasetRecordStorage",)
27import uuid
28from collections.abc import Iterable, Iterator, Sequence
29from typing import TYPE_CHECKING, Any
31import sqlalchemy
32from deprecated.sphinx import deprecated
34from ....core import (
35 DataCoordinate,
36 DataCoordinateSet,
37 DatasetId,
38 DatasetRef,
39 DatasetType,
40 SimpleQuery,
41 Timespan,
42 ddl,
43)
44from ..._collection_summary import CollectionSummary
45from ..._collectionType import CollectionType
46from ..._exceptions import CollectionTypeError, ConflictingDefinitionError, UnsupportedIdGeneratorError
47from ...interfaces import DatasetIdFactory, DatasetIdGenEnum, DatasetRecordStorage
48from .tables import makeTagTableSpec
50 if TYPE_CHECKING:    [50 ↛ 51] line 50 didn't jump to line 51, because the condition on line 50 was never true
51 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
52 from .summaries import CollectionSummaryManager
53 from .tables import StaticDatasetTablesTuple
56class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
57 """Dataset record storage implementation paired with
58 `ByDimensionsDatasetRecordStorageManager`; see that class for more
59 information.
61 Instances of this class should never be constructed directly; use
62 `DatasetRecordStorageManager.register` instead.
63 """
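    # Illustrative sketch (not part of the original module): storages are
    # obtained from the paired manager rather than constructed directly, e.g.
    # hypothetical caller code along the lines of
    #
    #     storage = manager.register(dataset_type)
    #
    # where ``manager`` is the ByDimensionsDatasetRecordStorageManager named
    # above; the exact return value of ``register`` may differ, so treat this
    # only as a usage hint.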
65 def __init__(
66 self,
67 *,
68 datasetType: DatasetType,
69 db: Database,
70 dataset_type_id: int,
71 collections: CollectionManager,
72 static: StaticDatasetTablesTuple,
73 summaries: CollectionSummaryManager,
74 tags: sqlalchemy.schema.Table,
75 calibs: sqlalchemy.schema.Table | None,
76 ):
77 super().__init__(datasetType=datasetType)
78 self._dataset_type_id = dataset_type_id
79 self._db = db
80 self._collections = collections
81 self._static = static
82 self._summaries = summaries
83 self._tags = tags
84 self._calibs = calibs
85 self._runKeyColumn = collections.getRunForeignKeyName()
87 def find(
88 self, collection: CollectionRecord, dataId: DataCoordinate, timespan: Timespan | None = None
89 ) -> DatasetRef | None:
90 # Docstring inherited from DatasetRecordStorage.
91 assert dataId.graph == self.datasetType.dimensions
92 if collection.type is CollectionType.CALIBRATION and timespan is None:    [92 ↛ 93] line 92 didn't jump to line 93, because the condition on line 92 was never true
93 raise TypeError(
94 f"Cannot search for dataset in CALIBRATION collection {collection.name} "
95 f"without an input timespan."
96 )
97 sql = self.select(
98 collection, dataId=dataId, id=SimpleQuery.Select, run=SimpleQuery.Select, timespan=timespan
99 )
100 with self._db.query(sql) as results:
101 row = results.fetchone()
102 if row is None:
103 return None
104 if collection.type is CollectionType.CALIBRATION:
105 # For temporal calibration lookups (only!) our invariants do
106 # not guarantee that the number of result rows is <= 1. They
107 # would if `select` constrained the given timespan to be
108 # _contained_ by the validity range in the self._calibs table,
109 # instead of simply _overlapping_ it, because we do guarantee
110 # that the validity ranges are disjoint for a particular
111 # dataset type, collection, and data ID. But using an overlap
112 # test and a check for multiple result rows here allows us to
113 # provide a more useful diagnostic, as well as allowing
114 # `select` to support more general queries where multiple
115 # results are not an error.
116 if results.fetchone() is not None:
117 raise RuntimeError(
118 f"Multiple matches found for calibration lookup in {collection.name} for "
119 f"{self.datasetType.name} with {dataId} overlapping {timespan}. "
120 )
121 return DatasetRef(
122 datasetType=self.datasetType,
123 dataId=dataId,
124 id=row.id,
125 run=self._collections[row._mapping[self._runKeyColumn]].name,
126 )
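    # Usage sketch (illustrative only; ``run_record``, ``calib_record``,
    # ``data_id`` and ``ts`` are assumed to come from the caller):
    #
    #     ref = storage.find(run_record, data_id)                 # RUN/TAGGED lookup
    #     ref = storage.find(calib_record, data_id, timespan=ts)  # CALIBRATION lookup
    #     if ref is None:
    #         ...  # no matching dataset in that collection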
128 def delete(self, datasets: Iterable[DatasetRef]) -> None:
129 # Docstring inherited from DatasetRecordStorage.
130 # Only delete from common dataset table; ON DELETE foreign key clauses
131 # will handle the rest.
132 self._db.delete(
133 self._static.dataset,
134 ["id"],
135 *[{"id": dataset.getCheckedId()} for dataset in datasets],
136 )
138 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
139 # Docstring inherited from DatasetRecordStorage.
140 if collection.type is not CollectionType.TAGGED:    [140 ↛ 141] line 140 didn't jump to line 141, because the condition on line 140 was never true
141 raise TypeError(
142 f"Cannot associate into collection '{collection.name}' "
143 f"of type {collection.type.name}; must be TAGGED."
144 )
145 protoRow = {
146 self._collections.getCollectionForeignKeyName(): collection.key,
147 "dataset_type_id": self._dataset_type_id,
148 }
149 rows = []
150 summary = CollectionSummary()
151 for dataset in summary.add_datasets_generator(datasets):
152 row = dict(protoRow, dataset_id=dataset.getCheckedId())
153 for dimension, value in dataset.dataId.items():
154 row[dimension.name] = value
155 rows.append(row)
156 # Update the summary tables for this collection in case this is the
157 # first time this dataset type or these governor values will be
158 # inserted there.
159 self._summaries.update(collection, [self._dataset_type_id], summary)
160 # Update the tag table itself.
161 self._db.replace(self._tags, *rows)
163 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
164 # Docstring inherited from DatasetRecordStorage.
165 if collection.type is not CollectionType.TAGGED:    [165 ↛ 166] line 165 didn't jump to line 166, because the condition on line 165 was never true
166 raise TypeError(
167 f"Cannot disassociate from collection '{collection.name}' "
168 f"of type {collection.type.name}; must be TAGGED."
169 )
170 rows = [
171 {
172 "dataset_id": dataset.getCheckedId(),
173 self._collections.getCollectionForeignKeyName(): collection.key,
174 }
175 for dataset in datasets
176 ]
177 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows)
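    # Usage sketch (illustrative; ``tagged_record`` is a TAGGED
    # CollectionRecord and ``refs`` are resolved DatasetRefs):
    #
    #     storage.associate(tagged_record, refs)
    #     storage.disassociate(tagged_record, refs)  # removes the same tags again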
179 def _buildCalibOverlapQuery(
180 self, collection: CollectionRecord, dataIds: DataCoordinateSet | None, timespan: Timespan
181 ) -> SimpleQuery:
182 assert self._calibs is not None
183 # Start by building a SELECT query for any rows that would overlap
184 # this one.
185 query = SimpleQuery()
186 query.join(self._calibs)
187 # Add a WHERE clause matching the dataset type and collection.
188 query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id)
189 query.where.append(
190 self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key
191 )
192 # Add a WHERE clause matching any of the given data IDs.
193 if dataIds is not None:
194 dataIds.constrain(
195 query,
196 lambda name: self._calibs.columns[name], # type: ignore
197 )
198 # Add WHERE clause for timespan overlaps.
199 TimespanReprClass = self._db.getTimespanRepresentation()
200 query.where.append(
201 TimespanReprClass.from_columns(self._calibs.columns).overlaps(
202 TimespanReprClass.fromLiteral(timespan)
203 )
204 )
205 return query
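    # Conceptual sketch of the query built above (illustrative only; actual
    # column names depend on the collection manager and timespan
    # representation in use):
    #
    #     SELECT ... FROM <calibs table>
    #     WHERE dataset_type_id = :dataset_type_id
    #       AND <collection FK column> = :collection_key
    #       AND <data ID columns match the given data IDs>
    #       AND <row validity range> OVERLAPS :timespan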
207 def certify(
208 self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan
209 ) -> None:
210 # Docstring inherited from DatasetRecordStorage.
211 if self._calibs is None:    [211 ↛ 212] line 211 didn't jump to line 212, because the condition on line 211 was never true
212 raise CollectionTypeError(
213 f"Cannot certify datasets of type {self.datasetType.name}, for which "
214 f"DatasetType.isCalibration() is False."
215 )
216 if collection.type is not CollectionType.CALIBRATION:    [216 ↛ 217] line 216 didn't jump to line 217, because the condition on line 216 was never true
217 raise CollectionTypeError(
218 f"Cannot certify into collection '{collection.name}' "
219 f"of type {collection.type.name}; must be CALIBRATION."
220 )
221 TimespanReprClass = self._db.getTimespanRepresentation()
222 protoRow = {
223 self._collections.getCollectionForeignKeyName(): collection.key,
224 "dataset_type_id": self._dataset_type_id,
225 }
226 rows = []
227 dataIds: set[DataCoordinate] | None = (
228 set() if not TimespanReprClass.hasExclusionConstraint() else None
229 )
230 summary = CollectionSummary()
231 for dataset in summary.add_datasets_generator(datasets):
232 row = dict(protoRow, dataset_id=dataset.getCheckedId())
233 for dimension, value in dataset.dataId.items():
234 row[dimension.name] = value
235 TimespanReprClass.update(timespan, result=row)
236 rows.append(row)
237 if dataIds is not None:    [237 ↛ 231] line 237 didn't jump to line 231, because the condition on line 237 was never false
238 dataIds.add(dataset.dataId)
239 # Update the summary tables for this collection in case this is the
240 # first time this dataset type or these governor values will be
241 # inserted there.
242 self._summaries.update(collection, [self._dataset_type_id], summary)
243 # Update the association table itself.
244 if TimespanReprClass.hasExclusionConstraint():    [244 ↛ 247] line 244 didn't jump to line 247, because the condition on line 244 was never true
245 # Rely on database constraint to enforce invariants; we just
246 # reraise the exception for consistency across DB engines.
247 try:
248 self._db.insert(self._calibs, *rows)
249 except sqlalchemy.exc.IntegrityError as err:
250 raise ConflictingDefinitionError(
251 f"Validity range conflict certifying datasets of type {self.datasetType.name} "
252 f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
253 ) from err
254 else:
255 # Have to implement exclusion constraint ourselves.
256 # Start by building a SELECT query for any rows that would overlap
257 # this one.
258 query = self._buildCalibOverlapQuery(
259 collection,
260 DataCoordinateSet(dataIds, graph=self.datasetType.dimensions), # type: ignore
261 timespan,
262 )
263 query.columns.append(sqlalchemy.sql.func.count())
264 sql = query.combine()
265 # Acquire a table lock to ensure there are no concurrent writes
266 # that could invalidate our checking before we finish the inserts. We
267 # use a SAVEPOINT in case there is an outer transaction that a
268 # failure here should not roll back.
269 with self._db.transaction(lock=[self._calibs], savepoint=True):
270 # Run the check SELECT query.
271 with self._db.query(sql) as sql_result:
272 conflicting = sql_result.scalar()
273 if conflicting > 0:
274 raise ConflictingDefinitionError(
275 f"{conflicting} validity range conflicts certifying datasets of type "
276 f"{self.datasetType.name} into {collection.name} for range "
277 f"[{timespan.begin}, {timespan.end})."
278 )
279 # Proceed with the insert.
280 self._db.insert(self._calibs, *rows)
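    # Usage sketch (illustrative only; ``calib_record`` is a CALIBRATION
    # CollectionRecord and ``refs`` are resolved DatasetRefs):
    #
    #     storage.certify(calib_record, refs, Timespan(t_begin, t_mid))
    #     storage.certify(calib_record, refs, Timespan(t_begin, t_end))
    #
    # The second call would raise ConflictingDefinitionError because its
    # validity range overlaps the one already certified.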
282 def decertify(
283 self,
284 collection: CollectionRecord,
285 timespan: Timespan,
286 *,
287 dataIds: Iterable[DataCoordinate] | None = None,
288 ) -> None:
289 # Docstring inherited from DatasetRecordStorage.
290 if self._calibs is None:    [290 ↛ 291] line 290 didn't jump to line 291, because the condition on line 290 was never true
291 raise CollectionTypeError(
292 f"Cannot decertify datasets of type {self.datasetType.name}, for which "
293 f"DatasetType.isCalibration() is False."
294 )
295 if collection.type is not CollectionType.CALIBRATION:    [295 ↛ 296] line 295 didn't jump to line 296, because the condition on line 295 was never true
296 raise CollectionTypeError(
297 f"Cannot decertify from collection '{collection.name}' "
298 f"of type {collection.type.name}; must be CALIBRATION."
299 )
300 TimespanReprClass = self._db.getTimespanRepresentation()
301 # Construct a SELECT query to find all rows that overlap our inputs.
302 dataIdSet: DataCoordinateSet | None
303 if dataIds is not None:
304 dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions)
305 else:
306 dataIdSet = None
307 query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan)
308 query.columns.extend(self._calibs.columns)
309 sql = query.combine()
310 # Set up collections to populate with the rows we'll want to modify.
311 # The insert rows will have the same values for collection and
312 # dataset type.
313 protoInsertRow = {
314 self._collections.getCollectionForeignKeyName(): collection.key,
315 "dataset_type_id": self._dataset_type_id,
316 }
317 rowsToDelete = []
318 rowsToInsert = []
319 # Acquire a table lock to ensure there are no concurrent writes
320 # between the SELECT and the DELETE and INSERT queries based on it.
321 with self._db.transaction(lock=[self._calibs], savepoint=True):
322 with self._db.query(sql) as sql_result:
323 sql_rows = sql_result.mappings().fetchall()
324 for row in sql_rows:
325 rowsToDelete.append({"id": row["id"]})
326 # Construct the insert row(s) by copying the prototype row,
327 # then adding the dimension column values, then adding what's
328 # left of the timespan from that row after we subtract the
329 # given timespan.
330 newInsertRow = protoInsertRow.copy()
331 newInsertRow["dataset_id"] = row["dataset_id"]
332 for name in self.datasetType.dimensions.required.names:
333 newInsertRow[name] = row[name]
334 rowTimespan = TimespanReprClass.extract(row)
335 assert rowTimespan is not None, "Field should have a NOT NULL constraint."
336 for diffTimespan in rowTimespan.difference(timespan):
337 rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()))
338 # Run the DELETE and INSERT queries.
339 self._db.delete(self._calibs, ["id"], *rowsToDelete)
340 self._db.insert(self._calibs, *rowsToInsert)
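    # Worked example (illustrative): if a dataset is certified over [t0, t4)
    # and we decertify [t1, t3), the existing row is deleted and replaced by
    # two rows covering [t0, t1) and [t3, t4), which is exactly what
    # ``rowTimespan.difference(timespan)`` yields above.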
342 def select(
343 self,
344 *collections: CollectionRecord,
345 dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
346 id: SimpleQuery.Select.Or[int | None] = SimpleQuery.Select,
347 run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
348 timespan: SimpleQuery.Select.Or[Timespan | None] = SimpleQuery.Select,
349 ingestDate: SimpleQuery.Select.Or[Timespan | None] = None,
350 rank: SimpleQuery.Select.Or[None] = None,
351 ) -> sqlalchemy.sql.Selectable:
352 # Docstring inherited from DatasetRecordStorage.
353 collection_types = {collection.type for collection in collections}
354 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened."
355 TimespanReprClass = self._db.getTimespanRepresentation()
356 #
357 # There are two kinds of table in play here:
358 #
359 # - the static dataset table (with the dataset ID, dataset type ID,
360 # run ID/name, and ingest date);
361 #
362 # - the dynamic tags/calibs table (with the dataset ID, dataset type
363 # type ID, collection ID/name, data ID, and possibly validity
364 # range).
365 #
366 # That means that we might want to return a query against either table
367 # or a JOIN of both, depending on which quantities the caller wants.
368 # But this method is documented/typed such that ``dataId`` is never
369 # `None` - i.e. we always constrain or retrieve the data ID. That
370 # means we'll always include the tags/calibs table and join in the
371 # static dataset table only if we need things from it that we can't get
372 # from the tags/calibs table.
373 #
374 # Note that it's important that we include a WHERE constraint on both
375 # tables for any column (e.g. dataset_type_id) that is in both when
376 # it's given explicitly; not doing so can prevent the query planner from
377 # using very important indexes. At present, we don't include those
378 # redundant columns in the JOIN ON expression, however, because the
379 # FOREIGN KEY (and its index) are defined only on dataset_id.
380 #
381 # We'll start by accumulating kwargs to pass to SimpleQuery.join when
382 # we bring in the tags/calibs table. We get the data ID or constrain
383 # it in the tags/calibs table(s), but that's multiple columns, not one,
384 # so we need to transform the one Select.Or argument into a dictionary
385 # of them.
386 kwargs: dict[str, Any]
387 if dataId is SimpleQuery.Select:
388 kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required}
389 else:
390 kwargs = dict(dataId.byName())
391 # We always constrain (never retrieve) the dataset type in at least the
392 # tags/calibs table.
393 kwargs["dataset_type_id"] = self._dataset_type_id
394 # Join in the tags and/or calibs tables, turning those 'kwargs' entries
395 # into WHERE constraints or SELECT columns as appropriate.
396 if collection_types != {CollectionType.CALIBRATION}:
397 # We'll need a subquery for the tags table if any of the given
398 # collections are not a CALIBRATION collection. This intentionally
399 # also fires when the list of collections is empty as a way to
400 # create a dummy subquery that we know will fail.
401 tags_query = SimpleQuery()
402 tags_query.join(self._tags, **kwargs)
403 # If the timespan is requested, simulate a potentially compound
404 # column whose values are the maximum and minimum timespan
405 # bounds.
406 # If the timespan is constrained, ignore the constraint, since
407 # it'd be guaranteed to evaluate to True.
408 if timespan is SimpleQuery.Select:
409 tags_query.columns.extend(TimespanReprClass.fromLiteral(Timespan(None, None)).flatten())
410 self._finish_single_select(
411 tags_query,
412 self._tags,
413 collections,
414 id=id,
415 run=run,
416 ingestDate=ingestDate,
417 rank=rank,
418 )
419 else:
420 tags_query = None
421 if CollectionType.CALIBRATION in collection_types:
422 # If at least one collection is a CALIBRATION collection, we'll
423 # need a subquery for the calibs table, and could include the
424 # timespan as a result or constraint.
425 calibs_query = SimpleQuery()
426 assert (
427 self._calibs is not None
428 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
429 calibs_query.join(self._calibs, **kwargs)
430 # Add the timespan column(s) to the result columns, or constrain
431 # the timespan via an overlap condition.
432 if timespan is SimpleQuery.Select:
433 calibs_query.columns.extend(TimespanReprClass.from_columns(self._calibs.columns).flatten())
434 elif timespan is not None:
435 calibs_query.where.append(
436 TimespanReprClass.from_columns(self._calibs.columns).overlaps(
437 TimespanReprClass.fromLiteral(timespan)
438 )
439 )
440 self._finish_single_select(
441 calibs_query,
442 self._calibs,
443 collections,
444 id=id,
445 run=run,
446 ingestDate=ingestDate,
447 rank=rank,
448 )
449 else:
450 calibs_query = None
451 if calibs_query is not None:
452 if tags_query is not None:
453 return tags_query.combine().union(calibs_query.combine())
454 else:
455 return calibs_query.combine()
456 else:
457 assert tags_query is not None, "Earlier logic should guarantee that at least one is not None."
458 return tags_query.combine()
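    # Call sketch (illustrative only; mirrors how ``find`` uses this method):
    #
    #     sql = storage.select(
    #         run_record,
    #         tagged_record,
    #         dataId=data_id,
    #         id=SimpleQuery.Select,
    #         run=SimpleQuery.Select,
    #         timespan=None,
    #     )
    #
    # When both non-CALIBRATION and CALIBRATION collections are given, the
    # result is a UNION of the tags and calibs subqueries built above.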
460 def _finish_single_select(
461 self,
462 query: SimpleQuery,
463 table: sqlalchemy.schema.Table,
464 collections: Sequence[CollectionRecord],
465 id: SimpleQuery.Select.Or[int | None],
466 run: SimpleQuery.Select.Or[None],
467 ingestDate: SimpleQuery.Select.Or[Timespan | None],
468 rank: SimpleQuery.Select.Or[None],
469 ) -> None:
470 dataset_id_col = table.columns.dataset_id
471 collection_col = table.columns[self._collections.getCollectionForeignKeyName()]
472 # We always constrain (never retrieve) the collection(s) in the
473 # tags/calibs table.
474 if len(collections) == 1:
475 query.where.append(collection_col == collections[0].key)
476 elif len(collections) == 0:
477 # We support the case where there are no collections as a way to
478 # generate a valid SQL query that can't yield results. This should
479 # never get executed, but lots of downstream code will still try
480 # to access the SQLAlchemy objects representing the columns in the
481 # subquery. That's not ideal, but it'd take a lot of refactoring
482 # to fix it (DM-31725).
483 query.where.append(sqlalchemy.sql.literal(False))
484 else:
485 query.where.append(collection_col.in_([collection.key for collection in collections]))
486 # Add rank, if requested, as a CASE-based calculation on the
487 # collection column.
488 if rank is not None:
489 assert rank is SimpleQuery.Select, "Cannot constrain rank, only select it."
490 query.columns.append(
491 sqlalchemy.sql.case(
492 {record.key: n for n, record in enumerate(collections)},
493 value=collection_col,
494 ).label("rank")
495 )
496 # We can always get the dataset_id from the tags/calibs table or
497 # constrain it there. Can't use kwargs for that because we need to
498 # alias it to 'id'.
499 if id is SimpleQuery.Select:
500 query.columns.append(dataset_id_col.label("id"))
501 elif id is not None:    [501 ↛ 502] line 501 didn't jump to line 502, because the condition on line 501 was never true
502 query.where.append(dataset_id_col == id)
503 # It's possible we now have everything we need, from just the
504 # tags/calibs table. The things we might need to get from the static
505 # dataset table are the run key and the ingest date.
506 need_static_table = False
507 static_kwargs: dict[str, Any] = {}
508 if run is not None:
509 assert run is SimpleQuery.Select, "To constrain the run name, pass a RunRecord as a collection."
510 if len(collections) == 1 and collections[0].type is CollectionType.RUN:
511 # If we are searching exactly one RUN collection, we
512 # know that if we find the dataset in that collection,
513 # then that's the dataset's run; we don't need to
514 # query for it.
515 query.columns.append(sqlalchemy.sql.literal(collections[0].key).label(self._runKeyColumn))
516 else:
517 static_kwargs[self._runKeyColumn] = SimpleQuery.Select
518 need_static_table = True
519 # Ingest date can only come from the static table.
520 if ingestDate is not None:
521 need_static_table = True
522 if ingestDate is SimpleQuery.Select:    [522 ↛ 525] line 522 didn't jump to line 525, because the condition on line 522 was never false
523 static_kwargs["ingest_date"] = SimpleQuery.Select
524 else:
525 assert isinstance(ingestDate, Timespan)
526 # Timespan is astropy Time (usually in TAI) and ingest_date is
527 # TIMESTAMP, convert values to Python datetime for sqlalchemy.
528 if ingestDate.isEmpty():
529 raise RuntimeError("Empty timespan constraint provided for ingest_date.")
530 if ingestDate.begin is not None:
531 begin = ingestDate.begin.utc.datetime # type: ignore
532 query.where.append(self._static.dataset.columns.ingest_date >= begin)
533 if ingestDate.end is not None:
534 end = ingestDate.end.utc.datetime # type: ignore
535 query.where.append(self._static.dataset.columns.ingest_date < end)
536 # If we need the static table, join it in via dataset_id and
537 # dataset_type_id
538 if need_static_table:
539 query.join(
540 self._static.dataset,
541 onclause=(dataset_id_col == self._static.dataset.columns.id),
542 **static_kwargs,
543 )
544 # Also constrain dataset_type_id in static table in case that helps
545 # generate a better plan.
546 # We could also include this in the JOIN ON clause, but my guess is
547 # that that's a good idea IFF it's in the foreign key, and right
548 # now it isn't.
549 query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id)
551 def getDataId(self, id: DatasetId) -> DataCoordinate:
552 """Return DataId for a dataset.
554 Parameters
555 ----------
556 id : `DatasetId`
557 Unique dataset identifier.
559 Returns
560 -------
561 dataId : `DataCoordinate`
562 DataId for the dataset.
563 """
564 # This query could return multiple rows (one for each tagged collection
565 # the dataset is in, plus one for its run collection), and we don't
566 # care which of those we get.
567 sql = (
568 self._tags.select()
569 .where(
570 sqlalchemy.sql.and_(
571 self._tags.columns.dataset_id == id,
572 self._tags.columns.dataset_type_id == self._dataset_type_id,
573 )
574 )
575 .limit(1)
576 )
577 with self._db.query(sql) as sql_result:
578 row = sql_result.mappings().fetchone()
579 assert row is not None, "Should be guaranteed by caller and foreign key constraints."
580 return DataCoordinate.standardize(
581 {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
582 graph=self.datasetType.dimensions,
583 )
586@deprecated(
587 "Integer dataset IDs are deprecated in favor of UUIDs; support will be removed after v25. "
588 "Please migrate or re-create this data repository.",
589 version="v25.0",
590 category=FutureWarning,
591)
592class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage):
593 """Implementation of ByDimensionsDatasetRecordStorage which uses an
594 auto-incremented integer column for dataset IDs.
595 """
597 def insert(
598 self,
599 run: RunRecord,
600 dataIds: Iterable[DataCoordinate],
601 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
602 ) -> Iterator[DatasetRef]:
603 # Docstring inherited from DatasetRecordStorage.
605 # We only support UNIQUE mode for integer dataset IDs
606 if idMode != DatasetIdGenEnum.UNIQUE:
607 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.")
609 # Transform a possibly-single-pass iterable into a list.
610 dataIdList = list(dataIds)
611 yield from self._insert(run, dataIdList)
613 def import_(
614 self,
615 run: RunRecord,
616 datasets: Iterable[DatasetRef],
617 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
618 reuseIds: bool = False,
619 ) -> Iterator[DatasetRef]:
620 # Docstring inherited from DatasetRecordStorage.
622 # We only support UNIQUE mode for integer dataset IDs
623 if idGenerationMode != DatasetIdGenEnum.UNIQUE:
624 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.")
626 # Make a list of dataIds and optionally dataset IDs.
627 dataIdList: list[DataCoordinate] = []
628 datasetIdList: list[int] = []
629 for dataset in datasets:
630 dataIdList.append(dataset.dataId)
632 # We only accept integer dataset IDs, but also allow None.
633 datasetId = dataset.id
634 if datasetId is None:
635 # if reuseIds is set then all IDs must be known
636 if reuseIds:
637 raise TypeError("All dataset IDs must be known if `reuseIds` is set")
638 elif isinstance(datasetId, int):
639 if reuseIds:
640 datasetIdList.append(datasetId)
641 else:
642 raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}")
644 yield from self._insert(run, dataIdList, datasetIdList)
646 def _insert(
647 self, run: RunRecord, dataIdList: list[DataCoordinate], datasetIdList: list[int] | None = None
648 ) -> Iterator[DatasetRef]:
649 """Common part of implementation of `insert` and `import_` methods."""
651 # Remember any governor dimension values we see.
652 summary = CollectionSummary()
653 summary.add_data_ids(self.datasetType, dataIdList)
655 staticRow = {
656 "dataset_type_id": self._dataset_type_id,
657 self._runKeyColumn: run.key,
658 }
659 with self._db.transaction():
660 # Insert into the static dataset table, generating autoincrement
661 # dataset_id values.
662 if datasetIdList:
663 # reuse existing IDs
664 rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList]
665 self._db.insert(self._static.dataset, *rows)
666 else:
667 # use auto-incremented IDs
668 datasetIdList = self._db.insert(
669 self._static.dataset, *([staticRow] * len(dataIdList)), returnIds=True
670 )
671 assert datasetIdList is not None
672 # Update the summary tables for this collection in case this is the
673 # first time this dataset type or these governor values will be
674 # inserted there.
675 self._summaries.update(run, [self._dataset_type_id], summary)
676 # Combine the generated dataset_id values and data ID fields to
677 # form rows to be inserted into the tags table.
678 protoTagsRow = {
679 "dataset_type_id": self._dataset_type_id,
680 self._collections.getCollectionForeignKeyName(): run.key,
681 }
682 tagsRows = [
683 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
684 for dataId, dataset_id in zip(dataIdList, datasetIdList)
685 ]
686 # Insert those rows into the tags table. This is where we'll
687 # get any unique constraint violations.
688 self._db.insert(self._tags, *tagsRows)
690 for dataId, datasetId in zip(dataIdList, datasetIdList):
691 yield DatasetRef(
692 datasetType=self.datasetType,
693 dataId=dataId,
694 id=datasetId,
695 run=run.name,
696 )
699class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
700 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for
701 dataset IDs.
702 """
704 idMaker = DatasetIdFactory()
705 """Factory for dataset IDs. In the future this factory may be shared with
706 other classes (e.g. Registry)."""
708 def insert(
709 self,
710 run: RunRecord,
711 dataIds: Iterable[DataCoordinate],
712 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
713 ) -> Iterator[DatasetRef]:
714 # Docstring inherited from DatasetRecordStorage.
716 # Iterate over data IDs, transforming a possibly-single-pass iterable
717 # into a list.
718 dataIdList = []
719 rows = []
720 summary = CollectionSummary()
721 for dataId in summary.add_data_ids_generator(self.datasetType, dataIds):
722 dataIdList.append(dataId)
723 rows.append(
724 {
725 "id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode),
726 "dataset_type_id": self._dataset_type_id,
727 self._runKeyColumn: run.key,
728 }
729 )
731 with self._db.transaction():
732 # Insert into the static dataset table.
733 self._db.insert(self._static.dataset, *rows)
734 # Update the summary tables for this collection in case this is the
735 # first time this dataset type or these governor values will be
736 # inserted there.
737 self._summaries.update(run, [self._dataset_type_id], summary)
738 # Combine the generated dataset_id values and data ID fields to
739 # form rows to be inserted into the tags table.
740 protoTagsRow = {
741 "dataset_type_id": self._dataset_type_id,
742 self._collections.getCollectionForeignKeyName(): run.key,
743 }
744 tagsRows = [
745 dict(protoTagsRow, dataset_id=row["id"], **dataId.byName())
746 for dataId, row in zip(dataIdList, rows)
747 ]
748 # Insert those rows into the tags table.
749 self._db.insert(self._tags, *tagsRows)
751 for dataId, row in zip(dataIdList, rows):
752 yield DatasetRef(
753 datasetType=self.datasetType,
754 dataId=dataId,
755 id=row["id"],
756 run=run.name,
757 )
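    # Usage sketch (illustrative; ``run_record`` is a RunRecord and the data
    # IDs are assumed to already match ``datasetType.dimensions``):
    #
    #     refs = list(storage.insert(run_record, [data_id_1, data_id_2]))
    #
    # Each returned DatasetRef carries a newly generated UUID in ``ref.id``.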
759 def import_(
760 self,
761 run: RunRecord,
762 datasets: Iterable[DatasetRef],
763 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
764 reuseIds: bool = False,
765 ) -> Iterator[DatasetRef]:
766 # Docstring inherited from DatasetRecordStorage.
768 # Iterate over data IDs, transforming a possibly-single-pass iterable
769 # into a list.
770 dataIds = {}
771 summary = CollectionSummary()
772 for dataset in summary.add_datasets_generator(datasets):
773 # Ignore unknown ID types; normally all IDs have the same type, but
774 # this code supports mixed types or missing IDs.
775 datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None
776 if datasetId is None:
777 datasetId = self.idMaker.makeDatasetId(
778 run.name, self.datasetType, dataset.dataId, idGenerationMode
779 )
780 dataIds[datasetId] = dataset.dataId
782 # We'll insert all new rows into a temporary table
783 tableSpec = makeTagTableSpec(self.datasetType, type(self._collections), ddl.GUID, constraints=False)
784 collFkName = self._collections.getCollectionForeignKeyName()
785 protoTagsRow = {
786 "dataset_type_id": self._dataset_type_id,
787 collFkName: run.key,
788 }
789 tmpRows = [
790 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
791 for dataset_id, dataId in dataIds.items()
792 ]
793 with self._db.transaction(for_temp_tables=True):
794 with self._db.temporary_table(tableSpec) as tmp_tags:
795 # store all incoming data in a temporary table
796 self._db.insert(tmp_tags, *tmpRows)
798 # There are some checks that we want to make for consistency
799 # of the new datasets with existing ones.
800 self._validateImport(tmp_tags, run)
802 # Before we merge temporary table into dataset/tags we need to
803 # drop datasets which are already there (and do not conflict).
804 self._db.deleteWhere(
805 tmp_tags,
806 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)),
807 )
809 # Copy it into the dataset table; we need to re-label some columns.
810 self._db.insert(
811 self._static.dataset,
812 select=sqlalchemy.sql.select(
813 tmp_tags.columns.dataset_id.label("id"),
814 tmp_tags.columns.dataset_type_id,
815 tmp_tags.columns[collFkName].label(self._runKeyColumn),
816 ),
817 )
819 # Update the summary tables for this collection in case this
820 # is the first time this dataset type or these governor values
821 # will be inserted there.
822 self._summaries.update(run, [self._dataset_type_id], summary)
824 # Copy it into tags table.
825 self._db.insert(self._tags, select=tmp_tags.select())
827 # Return refs in the same order as in the input list.
828 for dataset_id, dataId in dataIds.items():
829 yield DatasetRef(
830 datasetType=self.datasetType,
831 id=dataset_id,
832 dataId=dataId,
833 run=run.name,
834 )
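    # Usage sketch (illustrative): ``import_`` is typically fed refs that
    # already carry UUIDs, e.g. when copying datasets between repositories:
    #
    #     imported = list(storage.import_(run_record, refs_from_source_repo))
    #
    # Datasets whose UUIDs already exist are dropped from the temporary table
    # by the ``deleteWhere`` call above, while genuine mismatches raise
    # ConflictingDefinitionError from ``_validateImport``.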
836 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None:
837 """Validate imported refs against existing datasets.
839 Parameters
840 ----------
841 tmp_tags : `sqlalchemy.schema.Table`
842 Temporary table with new datasets and the same schema as tags
843 table.
844 run : `RunRecord`
845 The record object describing the `~CollectionType.RUN` collection.
847 Raises
848 ------
849 ConflictingDefinitionError
850 Raise if new datasets conflict with existing ones.
851 """
852 dataset = self._static.dataset
853 tags = self._tags
854 collFkName = self._collections.getCollectionForeignKeyName()
856 # Check that existing datasets have the same dataset type and
857 # run.
858 query = (
859 sqlalchemy.sql.select(
860 dataset.columns.id.label("dataset_id"),
861 dataset.columns.dataset_type_id.label("dataset_type_id"),
862 tmp_tags.columns.dataset_type_id.label("new dataset_type_id"),
863 dataset.columns[self._runKeyColumn].label("run"),
864 tmp_tags.columns[collFkName].label("new run"),
865 )
866 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id))
867 .where(
868 sqlalchemy.sql.or_(
869 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
870 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName],
871 )
872 )
873 .limit(1)
874 )
875 with self._db.query(query) as result:
876 if (row := result.first()) is not None:
877 # Only include the first one in the exception message
878 raise ConflictingDefinitionError(
879 f"Existing dataset type or run do not match new dataset: {row._asdict()}"
880 )
882 # Check that matching dataset in tags table has the same DataId.
883 query = (
884 sqlalchemy.sql.select(
885 tags.columns.dataset_id,
886 tags.columns.dataset_type_id.label("type_id"),
887 tmp_tags.columns.dataset_type_id.label("new type_id"),
888 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
889 *[
890 tmp_tags.columns[dim].label(f"new {dim}")
891 for dim in self.datasetType.dimensions.required.names
892 ],
893 )
894 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id))
895 .where(
896 sqlalchemy.sql.or_(
897 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
898 *[
899 tags.columns[dim] != tmp_tags.columns[dim]
900 for dim in self.datasetType.dimensions.required.names
901 ],
902 )
903 )
904 .limit(1)
905 )
907 with self._db.query(query) as result:
908 if (row := result.first()) is not None:
909 # Only include the first one in the exception message
910 raise ConflictingDefinitionError(
911 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}"
912 )
914 # Check that matching run+dataId have the same dataset ID.
915 query = (
916 sqlalchemy.sql.select(
917 tags.columns.dataset_type_id.label("dataset_type_id"),
918 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
919 tags.columns.dataset_id,
920 tmp_tags.columns.dataset_id.label("new dataset_id"),
921 tags.columns[collFkName],
922 tmp_tags.columns[collFkName].label(f"new {collFkName}"),
923 )
924 .select_from(
925 tags.join(
926 tmp_tags,
927 sqlalchemy.sql.and_(
928 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id,
929 tags.columns[collFkName] == tmp_tags.columns[collFkName],
930 *[
931 tags.columns[dim] == tmp_tags.columns[dim]
932 for dim in self.datasetType.dimensions.required.names
933 ],
934 ),
935 )
936 )
937 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id)
938 .limit(1)
939 )
940 with self._db.query(query) as result:
941 if (row := result.first()) is not None:
942 # only include the first one in the exception message
943 raise ConflictingDefinitionError(
944 f"Existing dataset type and dataId do not match new dataset: {row._asdict()}"
945 )