Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 89%
317 statements
from __future__ import annotations

__all__ = ("ByDimensionsDatasetRecordStorage",)

import uuid
from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Tuple

import sqlalchemy
from lsst.daf.butler import (
    CollectionType,
    DataCoordinate,
    DataCoordinateSet,
    DatasetId,
    DatasetRef,
    DatasetType,
    SimpleQuery,
    Timespan,
    ddl,
)
from lsst.daf.butler.registry import (
    CollectionTypeError,
    ConflictingDefinitionError,
    UnsupportedIdGeneratorError,
)
from lsst.daf.butler.registry.interfaces import DatasetIdGenEnum, DatasetRecordStorage

from ...summaries import GovernorDimensionRestriction
from .tables import makeTagTableSpec

if TYPE_CHECKING:
    from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
    from .summaries import CollectionSummaryManager
    from .tables import StaticDatasetTablesTuple


class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
    """Dataset record storage implementation paired with
    `ByDimensionsDatasetRecordStorageManager`; see that class for more
    information.

    Instances of this class should never be constructed directly; use
    `DatasetRecordStorageManager.register` instead.
    """

    def __init__(
        self,
        *,
        datasetType: DatasetType,
        db: Database,
        dataset_type_id: int,
        collections: CollectionManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
        tags: sqlalchemy.schema.Table,
        calibs: Optional[sqlalchemy.schema.Table],
    ):
        super().__init__(datasetType=datasetType)
        self._dataset_type_id = dataset_type_id
        self._db = db
        self._collections = collections
        self._static = static
        self._summaries = summaries
        self._tags = tags
        self._calibs = calibs
        self._runKeyColumn = collections.getRunForeignKeyName()

    def find(
        self, collection: CollectionRecord, dataId: DataCoordinate, timespan: Optional[Timespan] = None
    ) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.
        assert dataId.graph == self.datasetType.dimensions
        if collection.type is CollectionType.CALIBRATION and timespan is None:
            raise TypeError(
                f"Cannot search for dataset in CALIBRATION collection {collection.name} "
                f"without an input timespan."
            )
        sql = self.select(
            collection, dataId=dataId, id=SimpleQuery.Select, run=SimpleQuery.Select, timespan=timespan
        )
        with self._db.query(sql) as results:
            row = results.fetchone()
            if row is None:
                return None
            if collection.type is CollectionType.CALIBRATION:
                # For temporal calibration lookups (only!) our invariants do
                # not guarantee that the number of result rows is <= 1. They
                # would if `select` constrained the given timespan to be
                # _contained_ by the validity range in the self._calibs table,
                # instead of simply _overlapping_ it, because we do guarantee
                # that the validity ranges are disjoint for a particular
                # dataset type, collection, and data ID. But using an overlap
                # test and a check for multiple result rows here allows us to
                # provide a more useful diagnostic, as well as allowing
                # `select` to support more general queries where multiple
                # results are not an error.
                if results.fetchone() is not None:
                    raise RuntimeError(
                        f"Multiple matches found for calibration lookup in {collection.name} for "
                        f"{self.datasetType.name} with {dataId} overlapping {timespan}. "
                    )
        return DatasetRef(
            datasetType=self.datasetType,
            dataId=dataId,
            id=row.id,
            run=self._collections[row._mapping[self._runKeyColumn]].name,
        )

    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        # Only delete from common dataset table; ON DELETE foreign key clauses
        # will handle the rest.
        self._db.delete(
            self._static.dataset,
            ["id"],
            *[{"id": dataset.getCheckedId()} for dataset in datasets],
        )

    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:
            raise TypeError(
                f"Cannot associate into collection '{collection.name}' "
                f"of type {collection.type.name}; must be TAGGED."
            )
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            governorValues.update_extract(dataset.dataId)
            rows.append(row)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values will be
        # inserted there.
        self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
        # Update the tag table itself.
        self._db.replace(self._tags, *rows)

    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:
            raise TypeError(
                f"Cannot disassociate from collection '{collection.name}' "
                f"of type {collection.type.name}; must be TAGGED."
            )
        rows = [
            {
                "dataset_id": dataset.getCheckedId(),
                self._collections.getCollectionForeignKeyName(): collection.key,
            }
            for dataset in datasets
        ]
        self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows)

    def _buildCalibOverlapQuery(
        self, collection: CollectionRecord, dataIds: Optional[DataCoordinateSet], timespan: Timespan
    ) -> SimpleQuery:
        assert self._calibs is not None
        # Start by building a SELECT query for any rows that would overlap
        # this one.
        query = SimpleQuery()
        query.join(self._calibs)
        # Add a WHERE clause matching the dataset type and collection.
        query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id)
        query.where.append(
            self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key
        )
        # Add a WHERE clause matching any of the given data IDs.
        if dataIds is not None:
            dataIds.constrain(
                query,
                lambda name: self._calibs.columns[name],  # type: ignore
            )
        # Add WHERE clause for timespan overlaps.
        TimespanReprClass = self._db.getTimespanRepresentation()
        query.where.append(
            TimespanReprClass.fromSelectable(self._calibs).overlaps(TimespanReprClass.fromLiteral(timespan))
        )
        return query

    def certify(
        self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan
    ) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:
            raise CollectionTypeError(
                f"Cannot certify datasets of type {self.datasetType.name}, for which "
                f"DatasetType.isCalibration() is False."
            )
        if collection.type is not CollectionType.CALIBRATION:
            raise CollectionTypeError(
                f"Cannot certify into collection '{collection.name}' "
                f"of type {collection.type.name}; must be CALIBRATION."
            )
        TimespanReprClass = self._db.getTimespanRepresentation()
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        dataIds: Optional[Set[DataCoordinate]] = (
            set() if not TimespanReprClass.hasExclusionConstraint() else None
        )
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            TimespanReprClass.update(timespan, result=row)
            governorValues.update_extract(dataset.dataId)
            rows.append(row)
            if dataIds is not None:
                dataIds.add(dataset.dataId)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values will be
        # inserted there.
        self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
        # Update the association table itself.
        if TimespanReprClass.hasExclusionConstraint():
            # Rely on the database constraint to enforce invariants; we just
            # reraise the exception for consistency across DB engines.
            try:
                self._db.insert(self._calibs, *rows)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Validity range conflict certifying datasets of type {self.datasetType.name} "
                    f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
                ) from err
        else:
            # Have to implement the exclusion constraint ourselves.
            # Start by building a SELECT query for any rows that would overlap
            # this one.
            query = self._buildCalibOverlapQuery(
                collection,
                DataCoordinateSet(dataIds, graph=self.datasetType.dimensions),  # type: ignore
                timespan,
            )
            query.columns.append(sqlalchemy.sql.func.count())
            sql = query.combine()
            # Acquire a table lock to ensure there are no concurrent writes
            # that could invalidate our check before we finish the inserts.
            # We use a SAVEPOINT in case there is an outer transaction that a
            # failure here should not roll back.
            with self._db.transaction(lock=[self._calibs], savepoint=True):
                # Run the check SELECT query.
                with self._db.query(sql) as sql_result:
                    conflicting = sql_result.scalar()
                if conflicting > 0:
                    raise ConflictingDefinitionError(
                        f"{conflicting} validity range conflicts certifying datasets of type "
                        f"{self.datasetType.name} into {collection.name} for range "
                        f"[{timespan.begin}, {timespan.end})."
                    )
                # Proceed with the insert.
                self._db.insert(self._calibs, *rows)

    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Optional[Iterable[DataCoordinate]] = None,
    ) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:
            raise CollectionTypeError(
                f"Cannot decertify datasets of type {self.datasetType.name}, for which "
                f"DatasetType.isCalibration() is False."
            )
        if collection.type is not CollectionType.CALIBRATION:
            raise CollectionTypeError(
                f"Cannot decertify from collection '{collection.name}' "
                f"of type {collection.type.name}; must be CALIBRATION."
            )
        TimespanReprClass = self._db.getTimespanRepresentation()
        # Construct a SELECT query to find all rows that overlap our inputs.
        dataIdSet: Optional[DataCoordinateSet]
        if dataIds is not None:
            dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions)
        else:
            dataIdSet = None
        query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan)
        query.columns.extend(self._calibs.columns)
        sql = query.combine()
        # Set up collections to populate with the rows we'll want to modify.
        # The insert rows will have the same values for collection and
        # dataset type.
        protoInsertRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rowsToDelete = []
        rowsToInsert = []
        # Acquire a table lock to ensure there are no concurrent writes
        # between the SELECT and the DELETE and INSERT queries based on it.
        with self._db.transaction(lock=[self._calibs], savepoint=True):
            with self._db.query(sql) as sql_result:
                sql_rows = sql_result.mappings().fetchall()
            for row in sql_rows:
                rowsToDelete.append({"id": row["id"]})
                # Construct the insert row(s) by copying the prototype row,
                # then adding the dimension column values, then adding what's
                # left of the timespan from that row after we subtract the
                # given timespan.
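                # For example (an illustrative sketch): if a stored validity
                # range is [t1, t4) and we decertify over [t2, t3) with
                # t1 < t2 < t3 < t4, the difference computed below yields
                # [t1, t2) and [t3, t4), so the original row is deleted and
                # replaced by two narrower rows; a decertification covering
                # the whole range produces no replacement rows at all.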
                newInsertRow = protoInsertRow.copy()
                newInsertRow["dataset_id"] = row["dataset_id"]
                for name in self.datasetType.dimensions.required.names:
                    newInsertRow[name] = row[name]
                rowTimespan = TimespanReprClass.extract(row)
                assert rowTimespan is not None, "Field should have a NOT NULL constraint."
                for diffTimespan in rowTimespan.difference(timespan):
                    rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()))
            # Run the DELETE and INSERT queries.
            self._db.delete(self._calibs, ["id"], *rowsToDelete)
            self._db.insert(self._calibs, *rowsToInsert)

    def select(
        self,
        *collections: CollectionRecord,
        dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
        id: SimpleQuery.Select.Or[Optional[int]] = SimpleQuery.Select,
        run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
        timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
        ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
    ) -> sqlalchemy.sql.Selectable:
        # Docstring inherited from DatasetRecordStorage.
        collection_types = {collection.type for collection in collections}
        assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened."
        #
        # There are two kinds of table in play here:
        #
        # - the static dataset table (with the dataset ID, dataset type ID,
        #   run ID/name, and ingest date);
        #
        # - the dynamic tags/calibs table (with the dataset ID, dataset type
        #   ID, collection ID/name, data ID, and possibly validity range).
        #
        # That means that we might want to return a query against either table
        # or a JOIN of both, depending on which quantities the caller wants.
        # But this method is documented/typed such that ``dataId`` is never
        # `None` - i.e. we always constrain or retrieve the data ID.  That
        # means we'll always include the tags/calibs table and join in the
        # static dataset table only if we need things from it that we can't
        # get from the tags/calibs table.
        #
        # Note that it's important that we include a WHERE constraint on both
        # tables for any column (e.g. dataset_type_id) that is in both when
        # it's given explicitly; not doing so can prevent the query planner
        # from using very important indexes.  At present, we don't include
        # those redundant columns in the JOIN ON expression, however, because
        # the FOREIGN KEY (and its index) are defined only on dataset_id.
        #
        # We'll start by accumulating kwargs to pass to SimpleQuery.join when
        # we bring in the tags/calibs table.  We get the data ID or constrain
        # it in the tags/calibs table(s), but that's multiple columns, not
        # one, so we need to transform the one Select.Or argument into a
        # dictionary of them.
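        #
        # Roughly (a sketch of the shape, not the exact SQL emitted): for
        # TAGGED/RUN collections we build
        #     SELECT ... FROM tags [JOIN dataset ON tags.dataset_id = dataset.id]
        #     WHERE tags.dataset_type_id = :dtid AND tags.<collection_fk> IN (...)
        # and for CALIBRATION collections the same thing against the calibs
        # table, with the validity-range columns either selected or
        # constrained by a timespan overlap test; when both kinds of
        # collection are present the two SELECTs are combined with UNION.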
        kwargs: Dict[str, Any]
        if dataId is SimpleQuery.Select:
            kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required}
        else:
            kwargs = dict(dataId.byName())
        # We always constrain (never retrieve) the dataset type in at least
        # the tags/calibs table.
        kwargs["dataset_type_id"] = self._dataset_type_id
        # Join in the tags and/or calibs tables, turning those 'kwargs'
        # entries into WHERE constraints or SELECT columns as appropriate.
        if collection_types != {CollectionType.CALIBRATION}:
            # We'll need a subquery for the tags table if any of the given
            # collections are not a CALIBRATION collection.  This
            # intentionally also fires when the list of collections is empty,
            # as a way to create a dummy subquery that we know will return no
            # rows.
            tags_query = SimpleQuery()
            tags_query.join(self._tags, **kwargs)
            self._finish_single_select(
                tags_query, self._tags, collections, id=id, run=run, ingestDate=ingestDate
            )
        else:
            tags_query = None
        if CollectionType.CALIBRATION in collection_types:
            # If at least one collection is a CALIBRATION collection, we'll
            # need a subquery for the calibs table, and could include the
            # timespan as a result or constraint.
            calibs_query = SimpleQuery()
            assert (
                self._calibs is not None
            ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
            TimespanReprClass = self._db.getTimespanRepresentation()
            # Add the timespan column(s) to the result columns, or constrain
            # the timespan via an overlap condition.
            if timespan is SimpleQuery.Select:
                kwargs.update({k: SimpleQuery.Select for k in TimespanReprClass.getFieldNames()})
            elif timespan is not None:
                calibs_query.where.append(
                    TimespanReprClass.fromSelectable(self._calibs).overlaps(
                        TimespanReprClass.fromLiteral(timespan)
                    )
                )
            calibs_query.join(self._calibs, **kwargs)
            self._finish_single_select(
                calibs_query, self._calibs, collections, id=id, run=run, ingestDate=ingestDate
            )
        else:
            calibs_query = None
        if calibs_query is not None:
            if tags_query is not None:
                if timespan is not None:
                    raise TypeError(
                        "Cannot query for timespan when the collections include both calibration and "
                        "non-calibration collections."
                    )
                return tags_query.combine().union(calibs_query.combine())
            else:
                return calibs_query.combine()
        else:
            assert tags_query is not None, "Earlier logic should guarantee that at least one is not None."
            return tags_query.combine()

    def _finish_single_select(
        self,
        query: SimpleQuery,
        table: sqlalchemy.schema.Table,
        collections: Sequence[CollectionRecord],
        id: SimpleQuery.Select.Or[Optional[int]],
        run: SimpleQuery.Select.Or[None],
        ingestDate: SimpleQuery.Select.Or[Optional[Timespan]],
    ) -> None:
        dataset_id_col = table.columns.dataset_id
        collection_col = table.columns[self._collections.getCollectionForeignKeyName()]
        # We always constrain (never retrieve) the collection(s) in the
        # tags/calibs table.
        if len(collections) == 1:
            query.where.append(collection_col == collections[0].key)
        elif len(collections) == 0:
            # We support the case where there are no collections as a way to
            # generate a valid SQL query that can't yield results.  This
            # should never get executed, but lots of downstream code will
            # still try to access the SQLAlchemy objects representing the
            # columns in the subquery.  That's not ideal, but it'd take a lot
            # of refactoring to fix it (DM-31725).
            query.where.append(sqlalchemy.sql.literal(False))
        else:
            query.where.append(collection_col.in_([collection.key for collection in collections]))
        # We can always get the dataset_id from the tags/calibs table or
        # constrain it there.  Can't use kwargs for that because we need to
        # alias it to 'id'.
        if id is SimpleQuery.Select:
            query.columns.append(dataset_id_col.label("id"))
        elif id is not None:
            query.where.append(dataset_id_col == id)
        # It's possible we now have everything we need, from just the
        # tags/calibs table.  The things we might need to get from the static
        # dataset table are the run key and the ingest date.
        need_static_table = False
        static_kwargs: Dict[str, Any] = {}
        if run is not None:
            assert run is SimpleQuery.Select, "To constrain the run name, pass a RunRecord as a collection."
            if len(collections) == 1 and collections[0].type is CollectionType.RUN:
                # If we are searching exactly one RUN collection, we know that
                # if we find the dataset in that collection, then that's the
                # dataset's run; we don't need to query for it.
                query.columns.append(sqlalchemy.sql.literal(collections[0].key).label(self._runKeyColumn))
            else:
                static_kwargs[self._runKeyColumn] = SimpleQuery.Select
                need_static_table = True
        # Ingest date can only come from the static table.
        if ingestDate is not None:
            need_static_table = True
            if ingestDate is SimpleQuery.Select:
                static_kwargs["ingest_date"] = SimpleQuery.Select
            else:
                assert isinstance(ingestDate, Timespan)
                # Timespan is astropy Time (usually in TAI) and ingest_date is
                # TIMESTAMP, so convert values to Python datetime for
                # sqlalchemy.
                if ingestDate.isEmpty():
                    raise RuntimeError("Empty timespan constraint provided for ingest_date.")
                if ingestDate.begin is not None:
                    begin = ingestDate.begin.utc.datetime  # type: ignore
                    query.where.append(self._static.dataset.columns.ingest_date >= begin)
                if ingestDate.end is not None:
                    end = ingestDate.end.utc.datetime  # type: ignore
                    query.where.append(self._static.dataset.columns.ingest_date < end)
        # If we need the static table, join it in via dataset_id and
        # dataset_type_id.
        if need_static_table:
            query.join(
                self._static.dataset,
                onclause=(dataset_id_col == self._static.dataset.columns.id),
                **static_kwargs,
            )
            # Also constrain dataset_type_id in the static table in case that
            # helps generate a better plan.  We could also include this in the
            # JOIN ON clause, but my guess is that that's a good idea IFF it's
            # in the foreign key, and right now it isn't.
            query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id)

    def getDataId(self, id: DatasetId) -> DataCoordinate:
        """Return DataId for a dataset.

        Parameters
        ----------
        id : `DatasetId`
            Unique dataset identifier.

        Returns
        -------
        dataId : `DataCoordinate`
            DataId for the dataset.
        """
        # This query could return multiple rows (one for each tagged
        # collection the dataset is in, plus one for its run collection), and
        # we don't care which of those we get.
        sql = (
            self._tags.select()
            .where(
                sqlalchemy.sql.and_(
                    self._tags.columns.dataset_id == id,
                    self._tags.columns.dataset_type_id == self._dataset_type_id,
                )
            )
            .limit(1)
        )
        with self._db.query(sql) as sql_result:
            row = sql_result.mappings().fetchone()
        assert row is not None, "Should be guaranteed by caller and foreign key constraints."
        return DataCoordinate.standardize(
            {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
            graph=self.datasetType.dimensions,
        )


class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage):
    """Implementation of ByDimensionsDatasetRecordStorage which uses an
    auto-incremented integer column for dataset IDs.
    """

    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # We only support UNIQUE mode for integer dataset IDs.
        if idMode != DatasetIdGenEnum.UNIQUE:
            raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.")

        # Transform a possibly-single-pass iterable into a list.
        dataIdList = list(dataIds)
        yield from self._insert(run, dataIdList)

    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # We only support UNIQUE mode for integer dataset IDs.
        if idGenerationMode != DatasetIdGenEnum.UNIQUE:
            raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.")

        # Make a list of dataIds and optionally dataset IDs.
        dataIdList: List[DataCoordinate] = []
        datasetIdList: List[int] = []
        for dataset in datasets:
            dataIdList.append(dataset.dataId)

            # We only accept integer dataset IDs, but also allow None.
            datasetId = dataset.id
            if datasetId is None:
                # If reuseIds is set then all IDs must be known.
                if reuseIds:
                    raise TypeError("All dataset IDs must be known if `reuseIds` is set")
            elif isinstance(datasetId, int):
                if reuseIds:
                    datasetIdList.append(datasetId)
            else:
                raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}")

        yield from self._insert(run, dataIdList, datasetIdList)

    def _insert(
        self, run: RunRecord, dataIdList: List[DataCoordinate], datasetIdList: Optional[List[int]] = None
    ) -> Iterator[DatasetRef]:
        """Common part of the implementation of the `insert` and `import_`
        methods.
        """
        # Remember any governor dimension values we see.
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataId in dataIdList:
            governorValues.update_extract(dataId)

        staticRow = {
            "dataset_type_id": self._dataset_type_id,
            self._runKeyColumn: run.key,
        }
        with self._db.transaction():
            # Insert into the static dataset table, generating autoincrement
            # dataset_id values.
            if datasetIdList:
                # Reuse existing IDs.
                rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList]
                self._db.insert(self._static.dataset, *rows)
            else:
                # Use auto-incremented IDs.
                datasetIdList = self._db.insert(
                    self._static.dataset, *([staticRow] * len(dataIdList)), returnIds=True
                )
                assert datasetIdList is not None
            # Update the summary tables for this collection in case this is
            # the first time this dataset type or these governor values will
            # be inserted there.
            self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
            # Combine the generated dataset_id values and data ID fields to
            # form rows to be inserted into the tags table.
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): run.key,
            }
            tagsRows = [
                dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
                for dataId, dataset_id in zip(dataIdList, datasetIdList)
            ]
            # Insert those rows into the tags table.  This is where we'll
            # get any unique constraint violations.
            self._db.insert(self._tags, *tagsRows)

        for dataId, datasetId in zip(dataIdList, datasetIdList):
            yield DatasetRef(
                datasetType=self.datasetType,
                dataId=dataId,
                id=datasetId,
                run=run.name,
            )


class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
    """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for
    dataset IDs.
    """

    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    """Namespace UUID used for UUID5 generation.  Do not change.  This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # Remember any governor dimension values we see.
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)

        # Iterate over data IDs, transforming a possibly-single-pass iterable
        # into a list.
        dataIdList = []
        rows = []
        for dataId in dataIds:
            dataIdList.append(dataId)
            rows.append(
                {
                    "id": self._makeDatasetId(run, dataId, idMode),
                    "dataset_type_id": self._dataset_type_id,
                    self._runKeyColumn: run.key,
                }
            )
            governorValues.update_extract(dataId)

        with self._db.transaction():
            # Insert into the static dataset table.
            self._db.insert(self._static.dataset, *rows)
            # Update the summary tables for this collection in case this is
            # the first time this dataset type or these governor values will
            # be inserted there.
            self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
            # Combine the generated dataset_id values and data ID fields to
            # form rows to be inserted into the tags table.
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): run.key,
            }
            tagsRows = [
                dict(protoTagsRow, dataset_id=row["id"], **dataId.byName())
                for dataId, row in zip(dataIdList, rows)
            ]
            # Insert those rows into the tags table.
            self._db.insert(self._tags, *tagsRows)

        for dataId, row in zip(dataIdList, rows):
            yield DatasetRef(
                datasetType=self.datasetType,
                dataId=dataId,
                id=row["id"],
                run=run.name,
            )

    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # Remember any governor dimension values we see.
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)

        # Iterate over data IDs, transforming a possibly-single-pass iterable
        # into a dict keyed by dataset ID.
        dataIds = {}
        for dataset in datasets:
            # Ignore unknown ID types; normally all IDs have the same type,
            # but this code supports mixed types or missing IDs.
            datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None
            if datasetId is None:
                datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode)
            dataIds[datasetId] = dataset.dataId
            governorValues.update_extract(dataset.dataId)

        # We'll insert all new rows into a temporary table first.
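        # Overview of the merge performed below: validate the temporary rows
        # against existing datasets, drop temporary rows whose dataset_id is
        # already present in the static dataset table, insert what remains
        # into the dataset table (relabelling columns as needed), and finally
        # copy the remaining rows into the tags table.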
        tableSpec = makeTagTableSpec(self.datasetType, type(self._collections), ddl.GUID, constraints=False)
        collFkName = self._collections.getCollectionForeignKeyName()
        protoTagsRow = {
            "dataset_type_id": self._dataset_type_id,
            collFkName: run.key,
        }
        tmpRows = [
            dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
            for dataset_id, dataId in dataIds.items()
        ]
        with self._db.transaction(for_temp_tables=True):
            with self._db.temporary_table(tableSpec) as tmp_tags:
                # Store all incoming data in a temporary table.
                self._db.insert(tmp_tags, *tmpRows)

                # There are some checks that we want to make for consistency
                # of the new datasets with existing ones.
                self._validateImport(tmp_tags, run)

                # Before we merge the temporary table into dataset/tags we
                # need to drop datasets which are already there (and do not
                # conflict).
                self._db.deleteWhere(
                    tmp_tags,
                    tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)),
                )

                # Copy it into the dataset table; we need to re-label some
                # columns.
                self._db.insert(
                    self._static.dataset,
                    select=sqlalchemy.sql.select(
                        tmp_tags.columns.dataset_id.label("id"),
                        tmp_tags.columns.dataset_type_id,
                        tmp_tags.columns[collFkName].label(self._runKeyColumn),
                    ),
                )

                # Update the summary tables for this collection in case this
                # is the first time this dataset type or these governor values
                # will be inserted there.
                self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)

                # Copy it into the tags table.
                self._db.insert(self._tags, select=tmp_tags.select())

        # Return refs in the same order as in the input list.
        for dataset_id, dataId in dataIds.items():
            yield DatasetRef(
                datasetType=self.datasetType,
                id=dataset_id,
                dataId=dataId,
                run=run.name,
            )

    def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None:
        """Validate imported refs against existing datasets.

        Parameters
        ----------
        tmp_tags : `sqlalchemy.schema.Table`
            Temporary table with new datasets and the same schema as the tags
            table.
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection.

        Raises
        ------
        ConflictingDefinitionError
            Raised if new datasets conflict with existing ones.
        """
        dataset = self._static.dataset
        tags = self._tags
        collFkName = self._collections.getCollectionForeignKeyName()
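
        # The three queries below are each "find one offending row" checks
        # against the incoming temporary table: (1) an existing dataset row
        # whose dataset type or run differs from the new one, (2) an existing
        # tags row for the same dataset_id whose data ID differs, and (3) an
        # existing tags row with the same run and data ID but a different
        # dataset_id.  Any hit raises ConflictingDefinitionError.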

        # Check that existing datasets have the same dataset type and run.
        query = (
            sqlalchemy.sql.select(
                dataset.columns.id.label("dataset_id"),
                dataset.columns.dataset_type_id.label("dataset_type_id"),
                tmp_tags.columns.dataset_type_id.label("new dataset_type_id"),
                dataset.columns[self._runKeyColumn].label("run"),
                tmp_tags.columns[collFkName].label("new run"),
            )
            .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id))
            .where(
                sqlalchemy.sql.or_(
                    dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
                    dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName],
                )
            )
            .limit(1)
        )
        with self._db.query(query) as result:
            if (row := result.first()) is not None:
                # Only include the first one in the exception message.
                raise ConflictingDefinitionError(
                    f"Existing dataset type or run does not match new dataset: {row._asdict()}"
                )

        # Check that the matching dataset in the tags table has the same
        # data ID.
        query = (
            sqlalchemy.sql.select(
                tags.columns.dataset_id,
                tags.columns.dataset_type_id.label("type_id"),
                tmp_tags.columns.dataset_type_id.label("new type_id"),
                *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
                *[
                    tmp_tags.columns[dim].label(f"new {dim}")
                    for dim in self.datasetType.dimensions.required.names
                ],
            )
            .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id))
            .where(
                sqlalchemy.sql.or_(
                    tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
                    *[
                        tags.columns[dim] != tmp_tags.columns[dim]
                        for dim in self.datasetType.dimensions.required.names
                    ],
                )
            )
            .limit(1)
        )

        with self._db.query(query) as result:
            if (row := result.first()) is not None:
                # Only include the first one in the exception message.
                raise ConflictingDefinitionError(
                    f"Existing dataset type or dataId does not match new dataset: {row._asdict()}"
                )

        # Check that matching run+dataId have the same dataset ID.
        query = (
            sqlalchemy.sql.select(
                tags.columns.dataset_type_id.label("dataset_type_id"),
                *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
                tags.columns.dataset_id,
                tmp_tags.columns.dataset_id.label("new dataset_id"),
                tags.columns[collFkName],
                tmp_tags.columns[collFkName].label(f"new {collFkName}"),
            )
            .select_from(
                tags.join(
                    tmp_tags,
                    sqlalchemy.sql.and_(
                        tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id,
                        tags.columns[collFkName] == tmp_tags.columns[collFkName],
                        *[
                            tags.columns[dim] == tmp_tags.columns[dim]
                            for dim in self.datasetType.dimensions.required.names
                        ],
                    ),
                )
            )
            .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id)
            .limit(1)
        )
        with self._db.query(query) as result:
            if (row := result.first()) is not None:
                # Only include the first one in the exception message.
                raise ConflictingDefinitionError(
                    f"Existing dataset type and dataId do not match new dataset: {row._asdict()}"
                )

    def _makeDatasetId(
        self, run: RunRecord, dataId: DataCoordinate, idGenerationMode: DatasetIdGenEnum
    ) -> uuid.UUID:
        """Generate a dataset ID for a dataset.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the RUN collection for the dataset.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option.  `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID.  `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``.  `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
        """
        if idGenerationMode is DatasetIdGenEnum.UNIQUE:
            return uuid.uuid4()
        else:
            # WARNING: If you modify this code make sure that the order of
            # items in the `items` list below never changes.
            items: List[Tuple[str, str]] = []
            if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
                items = [
                    ("dataset_type", self.datasetType.name),
                ]
            elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
                items = [
                    ("dataset_type", self.datasetType.name),
                    ("run", run.name),
                ]
            else:
                raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

            for name, value in sorted(dataId.byName().items()):
                items.append((name, str(value)))
            data = ",".join(f"{key}={value}" for key, value in items)
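            # For example (hypothetical values): a DATAID_TYPE_RUN ID for
            # dataset type "flat" in run "some/run" with data ID
            # {"instrument": "Cam", "detector": 1} hashes the string
            # "dataset_type=flat,run=some/run,detector=1,instrument=Cam"
            # with uuid5 under NS_UUID.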
            return uuid.uuid5(self.NS_UUID, data)