Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 88%
1from __future__ import annotations
3__all__ = ("ByDimensionsDatasetRecordStorage",)
5from typing import (
6 Any,
7 Dict,
8 Iterable,
9 Iterator,
10 List,
11 Optional,
12 Set,
13 Tuple,
14 TYPE_CHECKING,
15)
16import uuid
18import sqlalchemy
20from lsst.daf.butler import (
21 CollectionType,
22 DataCoordinate,
23 DataCoordinateSet,
24 DatasetId,
25 DatasetRef,
26 DatasetType,
27 SimpleQuery,
28 Timespan,
29 ddl
30)
31from lsst.daf.butler.registry import ConflictingDefinitionError, UnsupportedIdGeneratorError
32from lsst.daf.butler.registry.interfaces import DatasetRecordStorage, DatasetIdGenEnum
34from ...summaries import GovernorDimensionRestriction
35from .tables import makeTagTableSpec
37if TYPE_CHECKING:  [37 ↛ 38: line 37 didn't jump to line 38, because the condition on line 37 was never true]
38 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
39 from .tables import StaticDatasetTablesTuple
40 from .summaries import CollectionSummaryManager
43class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
44 """Dataset record storage implementation paired with
45 `ByDimensionsDatasetRecordStorageManager`; see that class for more
46 information.
48 Instances of this class should never be constructed directly; use
49 `DatasetRecordStorageManager.register` instead.
50 """
52 def __init__(self, *, datasetType: DatasetType,
53 db: Database,
54 dataset_type_id: int,
55 collections: CollectionManager,
56 static: StaticDatasetTablesTuple,
57 summaries: CollectionSummaryManager,
58 tags: sqlalchemy.schema.Table,
59 calibs: Optional[sqlalchemy.schema.Table]):
60 super().__init__(datasetType=datasetType)
61 self._dataset_type_id = dataset_type_id
62 self._db = db
63 self._collections = collections
64 self._static = static
65 self._summaries = summaries
66 self._tags = tags
67 self._calibs = calibs
68 self._runKeyColumn = collections.getRunForeignKeyName()
70 def find(self, collection: CollectionRecord, dataId: DataCoordinate,
71 timespan: Optional[Timespan] = None) -> Optional[DatasetRef]:
72 # Docstring inherited from DatasetRecordStorage.
73 assert dataId.graph == self.datasetType.dimensions
74 if collection.type is CollectionType.CALIBRATION and timespan is None:  [74 ↛ 75: line 74 didn't jump to line 75, because the condition on line 74 was never true]
75 raise TypeError(f"Cannot search for dataset in CALIBRATION collection {collection.name} "
76 f"without an input timespan.")
77 sql = self.select(collection, dataId=dataId, id=SimpleQuery.Select,
78 run=SimpleQuery.Select, timespan=timespan)
79 sql = sql.combine()
80 results = self._db.query(sql)
81 row = results.fetchone()
82 if row is None:
83 return None
84 if collection.type is CollectionType.CALIBRATION:
85 # For temporal calibration lookups (only!) our invariants do not
86 # guarantee that the number of result rows is <= 1.
87 # They would if `select` constrained the given timespan to be
88 # _contained_ by the validity range in the self._calibs table,
89 # instead of simply _overlapping_ it, because we do guarantee that
90 # the validity ranges are disjoint for a particular dataset type,
91 # collection, and data ID. But using an overlap test and a check
92 # for multiple result rows here allows us to provide a more useful
93 # diagnostic, as well as allowing `select` to support more general
94 # queries where multiple results are not an error.
95 if results.fetchone() is not None:
96 raise RuntimeError(
97 f"Multiple matches found for calibration lookup in {collection.name} for "
98 f"{self.datasetType.name} with {dataId} overlapping {timespan}. "
99 )
100 return DatasetRef(
101 datasetType=self.datasetType,
102 dataId=dataId,
103 id=row.id,
104 run=self._collections[row._mapping[self._runKeyColumn]].name
105 )
107 def delete(self, datasets: Iterable[DatasetRef]) -> None:
108 # Docstring inherited from DatasetRecordStorage.
109 # Only delete from common dataset table; ON DELETE foreign key clauses
110 # will handle the rest.
111 self._db.delete(
112 self._static.dataset,
113 ["id"],
114 *[{"id": dataset.getCheckedId()} for dataset in datasets],
115 )
117 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
118 # Docstring inherited from DatasetRecordStorage.
119 if collection.type is not CollectionType.TAGGED:  [119 ↛ 120: line 119 didn't jump to line 120, because the condition on line 119 was never true]
120 raise TypeError(f"Cannot associate into collection '{collection.name}' "
121 f"of type {collection.type.name}; must be TAGGED.")
122 protoRow = {
123 self._collections.getCollectionForeignKeyName(): collection.key,
124 "dataset_type_id": self._dataset_type_id,
125 }
126 rows = []
127 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
128 for dataset in datasets:
129 row = dict(protoRow, dataset_id=dataset.getCheckedId())
130 for dimension, value in dataset.dataId.items():
131 row[dimension.name] = value
132 governorValues.update_extract(dataset.dataId)
133 rows.append(row)
134 # Update the summary tables for this collection in case this is the
135 # first time this dataset type or these governor values will be
136 # inserted there.
137 self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
138 # Update the tag table itself.
139 self._db.replace(self._tags, *rows)
141 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
142 # Docstring inherited from DatasetRecordStorage.
143 if collection.type is not CollectionType.TAGGED:  [143 ↛ 144: line 143 didn't jump to line 144, because the condition on line 143 was never true]
144 raise TypeError(f"Cannot disassociate from collection '{collection.name}' "
145 f"of type {collection.type.name}; must be TAGGED.")
146 rows = [
147 {
148 "dataset_id": dataset.getCheckedId(),
149 self._collections.getCollectionForeignKeyName(): collection.key
150 }
151 for dataset in datasets
152 ]
153 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()],
154 *rows)
156 def _buildCalibOverlapQuery(self, collection: CollectionRecord,
157 dataIds: Optional[DataCoordinateSet],
158 timespan: Timespan) -> SimpleQuery:
159 assert self._calibs is not None
160 # Start by building a SELECT query for any rows that would overlap
161 # this one.
162 query = SimpleQuery()
163 query.join(self._calibs)
164 # Add a WHERE clause matching the dataset type and collection.
165 query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id)
166 query.where.append(
167 self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key
168 )
169 # Add a WHERE clause matching any of the given data IDs.
170 if dataIds is not None:
171 dataIds.constrain(
172 query,
173 lambda name: self._calibs.columns[name], # type: ignore
174 )
175 # Add WHERE clause for timespan overlaps.
176 TimespanReprClass = self._db.getTimespanRepresentation()
177 query.where.append(
178 TimespanReprClass.fromSelectable(self._calibs).overlaps(TimespanReprClass.fromLiteral(timespan))
179 )
180 return query
182 def certify(self, collection: CollectionRecord, datasets: Iterable[DatasetRef],
183 timespan: Timespan) -> None:
184 # Docstring inherited from DatasetRecordStorage.
185 if self._calibs is None:  [185 ↛ 186: line 185 didn't jump to line 186, because the condition on line 185 was never true]
186 raise TypeError(f"Cannot certify datasets of type {self.datasetType.name}, for which "
187 f"DatasetType.isCalibration() is False.")
188 if collection.type is not CollectionType.CALIBRATION:  [188 ↛ 189: line 188 didn't jump to line 189, because the condition on line 188 was never true]
189 raise TypeError(f"Cannot certify into collection '{collection.name}' "
190 f"of type {collection.type.name}; must be CALIBRATION.")
191 TimespanReprClass = self._db.getTimespanRepresentation()
192 protoRow = {
193 self._collections.getCollectionForeignKeyName(): collection.key,
194 "dataset_type_id": self._dataset_type_id,
195 }
196 rows = []
197 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
198 dataIds: Optional[Set[DataCoordinate]] = (
199 set() if not TimespanReprClass.hasExclusionConstraint() else None
200 )
201 for dataset in datasets:
202 row = dict(protoRow, dataset_id=dataset.getCheckedId())
203 for dimension, value in dataset.dataId.items():
204 row[dimension.name] = value
205 TimespanReprClass.update(timespan, result=row)
206 governorValues.update_extract(dataset.dataId)
207 rows.append(row)
208 if dataIds is not None:  [208 ↛ 201: line 208 didn't jump to line 201, because the condition on line 208 was never false]
209 dataIds.add(dataset.dataId)
210 # Update the summary tables for this collection in case this is the
211 # first time this dataset type or these governor values will be
212 # inserted there.
213 self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
214 # Update the association table itself.
215 if TimespanReprClass.hasExclusionConstraint():  [215 ↛ 218: line 215 didn't jump to line 218, because the condition on line 215 was never true]
216 # Rely on database constraint to enforce invariants; we just
217 # reraise the exception for consistency across DB engines.
218 try:
219 self._db.insert(self._calibs, *rows)
220 except sqlalchemy.exc.IntegrityError as err:
221 raise ConflictingDefinitionError(
222 f"Validity range conflict certifying datasets of type {self.datasetType.name} "
223 f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
224 ) from err
225 else:
226 # Have to implement exclusion constraint ourselves.
227 # Start by building a SELECT query for any rows that would overlap
228 # this one.
229 query = self._buildCalibOverlapQuery(
230 collection,
231 DataCoordinateSet(dataIds, graph=self.datasetType.dimensions), # type: ignore
232 timespan
233 )
234 query.columns.append(sqlalchemy.sql.func.count())
235 sql = query.combine()
236 # Acquire a table lock to ensure there are no concurrent writes
237 # that could invalidate our checking before we finish the inserts. We
238 # use a SAVEPOINT in case there is an outer transaction that a
239 # failure here should not roll back.
240 with self._db.transaction(lock=[self._calibs], savepoint=True):
241 # Run the check SELECT query.
242 conflicting = self._db.query(sql).scalar()
243 if conflicting > 0:
244 raise ConflictingDefinitionError(
245 f"{conflicting} validity range conflicts certifying datasets of type "
246 f"{self.datasetType.name} into {collection.name} for range "
247 f"[{timespan.begin}, {timespan.end})."
248 )
249 # Proceed with the insert.
250 self._db.insert(self._calibs, *rows)
252 def decertify(self, collection: CollectionRecord, timespan: Timespan, *,
253 dataIds: Optional[Iterable[DataCoordinate]] = None) -> None:
254 # Docstring inherited from DatasetRecordStorage.
255 if self._calibs is None:  [255 ↛ 256: line 255 didn't jump to line 256, because the condition on line 255 was never true]
256 raise TypeError(f"Cannot decertify datasets of type {self.datasetType.name}, for which "
257 f"DatasetType.isCalibration() is False.")
258 if collection.type is not CollectionType.CALIBRATION:  [258 ↛ 259: line 258 didn't jump to line 259, because the condition on line 258 was never true]
259 raise TypeError(f"Cannot decertify from collection '{collection.name}' "
260 f"of type {collection.type.name}; must be CALIBRATION.")
261 TimespanReprClass = self._db.getTimespanRepresentation()
262 # Construct a SELECT query to find all rows that overlap our inputs.
263 dataIdSet: Optional[DataCoordinateSet]
264 if dataIds is not None:
265 dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions)
266 else:
267 dataIdSet = None
268 query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan)
269 query.columns.extend(self._calibs.columns)
270 sql = query.combine()
271 # Set up collections to populate with the rows we'll want to modify.
272 # The insert rows will have the same values for collection and
273 # dataset type.
274 protoInsertRow = {
275 self._collections.getCollectionForeignKeyName(): collection.key,
276 "dataset_type_id": self._dataset_type_id,
277 }
278 rowsToDelete = []
279 rowsToInsert = []
280 # Acquire a table lock to ensure there are no concurrent writes
281 # between the SELECT and the DELETE and INSERT queries based on it.
282 with self._db.transaction(lock=[self._calibs], savepoint=True):
283 for row in self._db.query(sql).mappings():
284 rowsToDelete.append({"id": row["id"]})
285 # Construct the insert row(s) by copying the prototype row,
286 # then adding the dimension column values, then adding what's
287 # left of the timespan from that row after we subtract the
288 # given timespan.
289 newInsertRow = protoInsertRow.copy()
290 newInsertRow["dataset_id"] = row["dataset_id"]
291 for name in self.datasetType.dimensions.required.names:
292 newInsertRow[name] = row[name]
293 rowTimespan = TimespanReprClass.extract(row)
294 assert rowTimespan is not None, "Field should have a NOT NULL constraint."
295 for diffTimespan in rowTimespan.difference(timespan):
296 rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()))
297 # Run the DELETE and INSERT queries.
298 self._db.delete(self._calibs, ["id"], *rowsToDelete)
299 self._db.insert(self._calibs, *rowsToInsert)
301 def select(self, *collections: CollectionRecord,
302 dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
303 id: SimpleQuery.Select.Or[Optional[int]] = SimpleQuery.Select,
304 run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
305 timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
306 ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
307 ) -> SimpleQuery:
308 # Docstring inherited from DatasetRecordStorage.
309 collection_types = {collection.type for collection in collections}
310 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened."
311 #
312 # There are two tables in play here:
313 #
314 # - the static dataset table (with the dataset ID, dataset type ID,
315 # run ID/name, and ingest date);
316 #
317 # - the dynamic tags/calibs table (with the dataset ID, dataset type
318 # ID, collection ID/name, data ID, and possibly validity
319 # range).
320 #
321 # That means that we might want to return a query against either table
322 # or a JOIN of both, depending on which quantities the caller wants.
323 # But this method is documented/typed such that ``dataId`` is never
324 # `None` - i.e. we always constrain or retrieve the data ID. That
325 # means we'll always include the tags/calibs table and join in the
326 # static dataset table only if we need things from it that we can't get
327 # from the tags/calibs table.
328 #
329 # Note that it's important that we include a WHERE constraint on both
330 # tables for any column (e.g. dataset_type_id) that is in both when
331 # it's given explicitly; not doing so can prevent the query planner from
332 # using very important indexes. At present, we don't include those
333 # redundant columns in the JOIN ON expression, however, because the
334 # FOREIGN KEY (and its index) are defined only on dataset_id.
335 #
336 # We'll start with an empty SimpleQuery, and accumulate kwargs to pass
337 # to its `join` method when we bring in the tags/calibs table.
338 query = SimpleQuery()
339 # We get the data ID or constrain it in the tags/calibs table, but
340 # that's multiple columns, not one, so we need to transform the one
341 # Select.Or argument into a dictionary of them.
342 kwargs: Dict[str, Any]
343 if dataId is SimpleQuery.Select:
344 kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required}
345 else:
346 kwargs = dict(dataId.byName())
347 # We always constrain (never retrieve) the dataset type in at least the
348 # tags/calibs table.
349 kwargs["dataset_type_id"] = self._dataset_type_id
350 # Join in the tags or calibs table, turning those 'kwargs' entries into
351 # WHERE constraints or SELECT columns as appropriate.
352 if collection_types == {CollectionType.CALIBRATION}:
353 assert self._calibs is not None, \
354 "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
355 TimespanReprClass = self._db.getTimespanRepresentation()
356 # Add the timespan column(s) to the result columns, or constrain
357 # the timespan via an overlap condition.
358 if timespan is SimpleQuery.Select:
359 kwargs.update({k: SimpleQuery.Select for k in TimespanReprClass.getFieldNames()})
360 elif timespan is not None:  [360 ↛ 366: line 360 didn't jump to line 366, because the condition on line 360 was never false]
361 query.where.append(
362 TimespanReprClass.fromSelectable(self._calibs).overlaps(
363 TimespanReprClass.fromLiteral(timespan)
364 )
365 )
366 query.join(self._calibs, **kwargs)
367 dataset_id_col = self._calibs.columns.dataset_id
368 collection_col = self._calibs.columns[self._collections.getCollectionForeignKeyName()]
369 elif CollectionType.CALIBRATION not in collection_types:  [369 ↛ 374: line 369 didn't jump to line 374, because the condition on line 369 was never false]
370 query.join(self._tags, **kwargs)
371 dataset_id_col = self._tags.columns.dataset_id
372 collection_col = self._tags.columns[self._collections.getCollectionForeignKeyName()]
373 else:
374 raise TypeError(
375 "Cannot query for CALIBRATION collections in the same "
376 "subquery as other kinds of collections."
377 )
378 # We always constrain (never retrieve) the collection(s) in the
379 # tags/calibs table.
380 if len(collections) == 1:
381 query.where.append(collection_col == collections[0].key)
382 elif len(collections) == 0:
383 # We support the case where there are no collections as a way to
384 # generate a valid SQL query that can't yield results. This should
385 # never get executed, but lots of downstream code will still try
386 # to access the SQLAlchemy objects representing the columns in the
387 # subquery. That's not ideal, but it'd take a lot of refactoring to
388 # fix it.
389 query.where.append(sqlalchemy.sql.literal(False))
390 else:
391 query.where.append(collection_col.in_([collection.key for collection in collections]))
392 # We can always get the dataset_id from the tags/calibs table or
393 # constrain it there. Can't use kwargs for that because we need to
394 # alias it to 'id'.
395 if id is SimpleQuery.Select:
396 query.columns.append(dataset_id_col.label("id"))
397 elif id is not None:  [397 ↛ 398: line 397 didn't jump to line 398, because the condition on line 397 was never true]
398 query.where.append(dataset_id_col == id)
399 # It's possible we now have everything we need, from just the
400 # tags/calibs table. The things we might need to get from the static
401 # dataset table are the run key and the ingest date.
402 need_static_table = False
403 static_kwargs: Dict[str, Any] = {}
404 if run is not None:
405 assert run is SimpleQuery.Select, "To constrain the run name, pass a RunRecord as a collection."
406 if len(collections) == 1 and collections[0].type is CollectionType.RUN:
407 # If we are searching exactly one RUN collection, we
408 # know that if we find the dataset in that collection,
409 # then that's the dataset's run; we don't need to
410 # query for it.
411 query.columns.append(sqlalchemy.sql.literal(collections[0].key).label(self._runKeyColumn))
412 else:
413 static_kwargs[self._runKeyColumn] = SimpleQuery.Select
414 need_static_table = True
415 # Ingest date can only come from the static table.
416 if ingestDate is not None:
417 need_static_table = True
418 if ingestDate is SimpleQuery.Select:  [418 ↛ 421: line 418 didn't jump to line 421, because the condition on line 418 was never false]
419 static_kwargs["ingest_date"] = SimpleQuery.Select
420 else:
421 assert isinstance(ingestDate, Timespan)
422 # Timespan is astropy Time (usually in TAI) and ingest_date is
423 # TIMESTAMP, convert values to Python datetime for sqlalchemy.
424 if ingestDate.isEmpty():
425 raise RuntimeError("Empty timespan constraint provided for ingest_date.")
426 if ingestDate.begin is not None:
427 begin = ingestDate.begin.utc.datetime # type: ignore
428 query.where.append(self._static.dataset.columns.ingest_date >= begin)
429 if ingestDate.end is not None:
430 end = ingestDate.end.utc.datetime # type: ignore
431 query.where.append(self._static.dataset.columns.ingest_date < end)
432 # If we need the static table, join it in via dataset_id and
433 # dataset_type_id
434 if need_static_table:
435 query.join(
436 self._static.dataset,
437 onclause=(dataset_id_col == self._static.dataset.columns.id),
438 **static_kwargs,
439 )
440 # Also constrain dataset_type_id in static table in case that helps
441 # generate a better plan.
442 # We could also include this in the JOIN ON clause, but my guess is
443 # that that's a good idea IFF it's in the foreign key, and right
444 # now it isn't.
445 query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id)
446 return query
448 def getDataId(self, id: DatasetId) -> DataCoordinate:
449 """Return DataId for a dataset.
451 Parameters
452 ----------
453 id : `DatasetId`
454 Unique dataset identifier.
456 Returns
457 -------
458 dataId : `DataCoordinate`
459 DataId for the dataset.
460 """
461 # This query could return multiple rows (one for each tagged collection
462 # the dataset is in, plus one for its run collection), and we don't
463 # care which of those we get.
464 sql = self._tags.select().where(
465 sqlalchemy.sql.and_(
466 self._tags.columns.dataset_id == id,
467 self._tags.columns.dataset_type_id == self._dataset_type_id
468 )
469 ).limit(1)
470 row = self._db.query(sql).mappings().fetchone()
471 assert row is not None, "Should be guaranteed by caller and foreign key constraints."
472 return DataCoordinate.standardize(
473 {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
474 graph=self.datasetType.dimensions
475 )
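# The following is a minimal, pure-Python sketch (not part of this module) of
# the half-open interval reasoning that `certify` and `decertify` above rely
# on: two ranges [a0, a1) and [b0, b1) overlap iff a0 < b1 and b0 < a1, and
# subtracting one range from another leaves zero, one, or two pieces.
# `Timespan` and its database representation provide the real implementations;
# the helper names below are hypothetical.
def example_overlaps(a: tuple, b: tuple) -> bool:
    return a[0] < b[1] and b[0] < a[1]


def example_difference(a: tuple, b: tuple) -> list:
    """Return the parts of half-open range ``a`` not covered by ``b``."""
    pieces = []
    if a[0] < min(a[1], b[0]):
        pieces.append((a[0], min(a[1], b[0])))  # part of ``a`` before ``b`` begins
    if max(a[0], b[1]) < a[1]:
        pieces.append((max(a[0], b[1]), a[1]))  # part of ``a`` after ``b`` ends
    return pieces


assert example_overlaps((0, 10), (5, 15))
assert example_difference((0, 10), (3, 7)) == [(0, 3), (7, 10)]  # range split in two
assert example_difference((0, 10), (-5, 20)) == []  # range fully decertified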
478class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage):
479 """Implementation of ByDimensionsDatasetRecordStorage which uses integer
480 auto-incremented column for dataset IDs.
481 """
483 def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate],
484 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]:
485 # Docstring inherited from DatasetRecordStorage.
487 # We only support UNIQUE mode for integer dataset IDs
488 if idMode != DatasetIdGenEnum.UNIQUE:  [488 ↛ 489: line 488 didn't jump to line 489, because the condition on line 488 was never true]
489 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.")
491 # Transform a possibly-single-pass iterable into a list.
492 dataIdList = list(dataIds)
493 yield from self._insert(run, dataIdList)
495 def import_(self, run: RunRecord, datasets: Iterable[DatasetRef],
496 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
497 reuseIds: bool = False) -> Iterator[DatasetRef]:
498 # Docstring inherited from DatasetRecordStorage.
500 # We only support UNIQUE mode for integer dataset IDs
501 if idGenerationMode != DatasetIdGenEnum.UNIQUE:  [501 ↛ 502: line 501 didn't jump to line 502, because the condition on line 501 was never true]
502 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.")
504 # Make a list of dataIds and optionally dataset IDs.
505 dataIdList: List[DataCoordinate] = []
506 datasetIdList: List[int] = []
507 for dataset in datasets:
508 dataIdList.append(dataset.dataId)
510 # We only accept integer dataset IDs, but also allow None.
511 datasetId = dataset.id
512 if datasetId is None:  [512 ↛ 514: line 512 didn't jump to line 514, because the condition on line 512 was never true]
513 # if reuseIds is set then all IDs must be known
514 if reuseIds:
515 raise TypeError("All dataset IDs must be known if `reuseIds` is set")
516 elif isinstance(datasetId, int):  [516 ↛ 520: line 516 didn't jump to line 520, because the condition on line 516 was never false]
517 if reuseIds:
518 datasetIdList.append(datasetId)
519 else:
520 raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}")
522 yield from self._insert(run, dataIdList, datasetIdList)
524 def _insert(self, run: RunRecord, dataIdList: List[DataCoordinate],
525 datasetIdList: Optional[List[int]] = None) -> Iterator[DatasetRef]:
526 """Common part of implementation of `insert` and `import_` methods.
527 """
529 # Remember any governor dimension values we see.
530 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
531 for dataId in dataIdList:
532 governorValues.update_extract(dataId)
534 staticRow = {
535 "dataset_type_id": self._dataset_type_id,
536 self._runKeyColumn: run.key,
537 }
538 with self._db.transaction():
539 # Insert into the static dataset table, generating autoincrement
540 # dataset_id values.
541 if datasetIdList:
542 # reuse existing IDs
543 rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList]
544 self._db.insert(self._static.dataset, *rows)
545 else:
546 # use auto-incremented IDs
547 datasetIdList = self._db.insert(self._static.dataset, *([staticRow]*len(dataIdList)),
548 returnIds=True)
549 assert datasetIdList is not None
550 # Update the summary tables for this collection in case this is the
551 # first time this dataset type or these governor values will be
552 # inserted there.
553 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
554 # Combine the generated dataset_id values and data ID fields to
555 # form rows to be inserted into the tags table.
556 protoTagsRow = {
557 "dataset_type_id": self._dataset_type_id,
558 self._collections.getCollectionForeignKeyName(): run.key,
559 }
560 tagsRows = [
561 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
562 for dataId, dataset_id in zip(dataIdList, datasetIdList)
563 ]
564 # Insert those rows into the tags table. This is where we'll
565 # get any unique constraint violations.
566 self._db.insert(self._tags, *tagsRows)
568 for dataId, datasetId in zip(dataIdList, datasetIdList):
569 yield DatasetRef(
570 datasetType=self.datasetType,
571 dataId=dataId,
572 id=datasetId,
573 run=run.name,
574 )
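# A minimal, self-contained sketch (not part of this module) of the "stage
# rows in a temporary table, then drop the ones that already exist via an
# IN-subquery" pattern used by the UUID-based `import_` method in the class
# below; the engine, table, and column names are hypothetical, and plain
# SQLAlchemy Core stands in for the `Database` abstraction.
import sqlalchemy as sa


def example_stage_and_dedup() -> list:
    engine = sa.create_engine("sqlite://")
    metadata = sa.MetaData()
    dataset = sa.Table("dataset", metadata, sa.Column("id", sa.String, primary_key=True))
    tmp_tags = sa.Table("tmp_tags", metadata, sa.Column("dataset_id", sa.String, primary_key=True))
    metadata.create_all(engine)
    with engine.begin() as conn:
        # Pre-existing datasets and a staged batch that partially overlaps them.
        conn.execute(dataset.insert(), [{"id": "a"}, {"id": "b"}])
        conn.execute(tmp_tags.insert(), [{"dataset_id": "a"}, {"dataset_id": "c"}])
        # Remove staged rows whose dataset is already registered, mirroring the
        # `deleteWhere(tmp_tags, tmp_tags.columns.dataset_id.in_(...))` call below.
        conn.execute(
            tmp_tags.delete().where(tmp_tags.c.dataset_id.in_(sa.select(dataset.c.id)))
        )
        # Only the genuinely new dataset remains to be copied into the real tables.
        return sorted(row.dataset_id for row in conn.execute(sa.select(tmp_tags.c.dataset_id)))


assert example_stage_and_dedup() == ["c"]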
577class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
578 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for
579 dataset IDs.
580 """
582 NS_UUID = uuid.UUID('840b31d9-05cd-5161-b2c8-00d32b280d0f')
583 """Namespace UUID used for UUID5 generation. Do not change. This was
584 produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
585 """
587 def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate],
588 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]:
589 # Docstring inherited from DatasetRecordStorage.
591 # Remember any governor dimension values we see.
592 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
594 # Iterate over data IDs, transforming a possibly-single-pass iterable
595 # into a list.
596 dataIdList = []
597 rows = []
598 for dataId in dataIds:
599 dataIdList.append(dataId)
600 rows.append({
601 "id": self._makeDatasetId(run, dataId, idMode),
602 "dataset_type_id": self._dataset_type_id,
603 self._runKeyColumn: run.key,
604 })
605 governorValues.update_extract(dataId)
607 with self._db.transaction():
608 # Insert into the static dataset table.
609 self._db.insert(self._static.dataset, *rows)
610 # Update the summary tables for this collection in case this is the
611 # first time this dataset type or these governor values will be
612 # inserted there.
613 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
614 # Combine the generated dataset_id values and data ID fields to
615 # form rows to be inserted into the tags table.
616 protoTagsRow = {
617 "dataset_type_id": self._dataset_type_id,
618 self._collections.getCollectionForeignKeyName(): run.key,
619 }
620 tagsRows = [
621 dict(protoTagsRow, dataset_id=row["id"], **dataId.byName())
622 for dataId, row in zip(dataIdList, rows)
623 ]
624 # Insert those rows into the tags table.
625 self._db.insert(self._tags, *tagsRows)
627 for dataId, row in zip(dataIdList, rows):
628 yield DatasetRef(
629 datasetType=self.datasetType,
630 dataId=dataId,
631 id=row["id"],
632 run=run.name,
633 )
635 def import_(self, run: RunRecord, datasets: Iterable[DatasetRef],
636 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
637 reuseIds: bool = False) -> Iterator[DatasetRef]:
638 # Docstring inherited from DatasetRecordStorage.
640 # Remember any governor dimension values we see.
641 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
643 # Iterate over data IDs, transforming a possibly-single-pass iterable
644 # into a list.
645 dataIds = {}
646 for dataset in datasets:
647 # Ignore unknown ID types, normally all IDs have the same type but
648 # this code supports mixed types or missing IDs.
649 datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None
650 if datasetId is None:
651 datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode)
652 dataIds[datasetId] = dataset.dataId
653 governorValues.update_extract(dataset.dataId)
655 with self._db.session() as session:
657 # insert all new rows into a temporary table
658 tableSpec = makeTagTableSpec(self.datasetType, type(self._collections),
659 ddl.GUID, constraints=False)
660 tmp_tags = session.makeTemporaryTable(tableSpec)
662 collFkName = self._collections.getCollectionForeignKeyName()
663 protoTagsRow = {
664 "dataset_type_id": self._dataset_type_id,
665 collFkName: run.key,
666 }
667 tmpRows = [dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
668 for dataset_id, dataId in dataIds.items()]
670 with self._db.transaction():
672 # store all incoming data in a temporary table
673 self._db.insert(tmp_tags, *tmpRows)
675 # There are some checks that we want to make for consistency
676 # of the new datasets with existing ones.
677 self._validateImport(tmp_tags, run)
679 # Before we merge the temporary table into dataset/tags we need to
680 # drop datasets which are already there (and do not conflict).
681 self._db.deleteWhere(tmp_tags, tmp_tags.columns.dataset_id.in_(
682 sqlalchemy.sql.select(self._static.dataset.columns.id)
683 ))
685 # Copy it into dataset table, need to re-label some columns.
686 self._db.insert(self._static.dataset, select=sqlalchemy.sql.select(
687 tmp_tags.columns.dataset_id.label("id"),
688 tmp_tags.columns.dataset_type_id,
689 tmp_tags.columns[collFkName].label(self._runKeyColumn)
690 ))
692 # Update the summary tables for this collection in case this
693 # is the first time this dataset type or these governor values
694 # will be inserted there.
695 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
697 # Copy it into tags table.
698 self._db.insert(self._tags, select=tmp_tags.select())
700 # Return refs in the same order as in the input list.
701 for dataset_id, dataId in dataIds.items():
702 yield DatasetRef(
703 datasetType=self.datasetType,
704 id=dataset_id,
705 dataId=dataId,
706 run=run.name,
707 )
709 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None:
710 """Validate imported refs against existing datasets.
712 Parameters
713 ----------
714 tmp_tags : `sqlalchemy.schema.Table`
715 Temporary table with new datasets and the same schema as tags
716 table.
717 run : `RunRecord`
718 The record object describing the `~CollectionType.RUN` collection.
720 Raises
721 ------
722 ConflictingDefinitionError
723 Raised if new datasets conflict with existing ones.
724 """
725 dataset = self._static.dataset
726 tags = self._tags
727 collFkName = self._collections.getCollectionForeignKeyName()
729 # Check that existing datasets have the same dataset type and
730 # run.
731 query = sqlalchemy.sql.select(
732 dataset.columns.id.label("dataset_id"),
733 dataset.columns.dataset_type_id.label("dataset_type_id"),
734 tmp_tags.columns.dataset_type_id.label("new dataset_type_id"),
735 dataset.columns[self._runKeyColumn].label("run"),
736 tmp_tags.columns[collFkName].label("new run")
737 ).select_from(
738 dataset.join(
739 tmp_tags,
740 dataset.columns.id == tmp_tags.columns.dataset_id
741 )
742 ).where(
743 sqlalchemy.sql.or_(
744 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
745 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName]
746 )
747 )
748 result = self._db.query(query)
749 if (row := result.first()) is not None:
750 # Only include the first one in the exception message
751 raise ConflictingDefinitionError(
752 f"Existing dataset type or run do not match new dataset: {row._asdict()}"
753 )
755 # Check that matching dataset in tags table has the same DataId.
756 query = sqlalchemy.sql.select(
757 tags.columns.dataset_id,
758 tags.columns.dataset_type_id.label("type_id"),
759 tmp_tags.columns.dataset_type_id.label("new type_id"),
760 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
761 *[tmp_tags.columns[dim].label(f"new {dim}")
762 for dim in self.datasetType.dimensions.required.names],
763 ).select_from(
764 tags.join(
765 tmp_tags,
766 tags.columns.dataset_id == tmp_tags.columns.dataset_id
767 )
768 ).where(
769 sqlalchemy.sql.or_(
770 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
771 *[tags.columns[dim] != tmp_tags.columns[dim]
772 for dim in self.datasetType.dimensions.required.names]
773 )
774 )
775 result = self._db.query(query)
776 if (row := result.first()) is not None:
777 # Only include the first one in the exception message
778 raise ConflictingDefinitionError(
779 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}"
780 )
782 # Check that matching run+dataId have the same dataset ID.
783 query = sqlalchemy.sql.select(
784 tags.columns.dataset_type_id.label("dataset_type_id"),
785 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
786 tags.columns.dataset_id,
787 tmp_tags.columns.dataset_id.label("new dataset_id"),
788 tags.columns[collFkName],
789 tmp_tags.columns[collFkName].label(f"new {collFkName}")
790 ).select_from(
791 tags.join(
792 tmp_tags,
793 sqlalchemy.sql.and_(
794 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id,
795 tags.columns[collFkName] == tmp_tags.columns[collFkName],
796 *[tags.columns[dim] == tmp_tags.columns[dim]
797 for dim in self.datasetType.dimensions.required.names]
798 )
799 )
800 ).where(
801 tags.columns.dataset_id != tmp_tags.columns.dataset_id
802 )
803 result = self._db.query(query)
804 if (row := result.first()) is not None:
805 # only include the first one in the exception message
806 raise ConflictingDefinitionError(
807 f"Existing dataset type and dataId does not match new dataset: {row._asdict()}"
808 )
810 def _makeDatasetId(self, run: RunRecord, dataId: DataCoordinate,
811 idGenerationMode: DatasetIdGenEnum) -> uuid.UUID:
812 """Generate dataset ID for a dataset.
814 Parameters
815 ----------
816 run : `RunRecord`
817 The record object describing the RUN collection for the dataset.
818 dataId : `DataCoordinate`
819 Expanded data ID for the dataset.
820 idGenerationMode : `DatasetIdGenEnum`
821 ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
822 UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
823 deterministic UUID5-type ID based on a dataset type name and
824 ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
825 deterministic UUID5-type ID based on a dataset type name, run
826 collection name, and ``dataId``.
828 Returns
829 -------
830 datasetId : `uuid.UUID`
831 Dataset identifier.
832 """
833 if idGenerationMode is DatasetIdGenEnum.UNIQUE:
834 return uuid.uuid4()
835 else:
836 # WARNING: If you modify this code make sure that the order of
837 # items in the `items` list below never changes.
838 items: List[Tuple[str, str]] = []
839 if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
840 items = [
841 ("dataset_type", self.datasetType.name),
842 ]
843 elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:  [843 ↛ 849: line 843 didn't jump to line 849, because the condition on line 843 was never false]
844 items = [
845 ("dataset_type", self.datasetType.name),
846 ("run", run.name),
847 ]
848 else:
849 raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")
851 for name, value in sorted(dataId.byName().items()):
852 items.append((name, str(value)))
853 data = ",".join(f"{key}={value}" for key, value in items)
854 return uuid.uuid5(self.NS_UUID, data)
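# A minimal sketch (not part of this module) of the deterministic ID scheme
# implemented by `_makeDatasetId` above for DATAID_TYPE_RUN mode: hashing a
# canonical "key=value,..." string with uuid5 under a fixed namespace yields
# the same UUID for the same dataset type, run, and data ID on every call.
# The function name and the plain-dict data ID are hypothetical stand-ins.
import uuid

_EXAMPLE_NS = uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")  # same recipe as NS_UUID above


def example_dataset_id(dataset_type: str, run: str, data_id: dict) -> uuid.UUID:
    items = [("dataset_type", dataset_type), ("run", run)]
    items += [(name, str(value)) for name, value in sorted(data_id.items())]
    return uuid.uuid5(_EXAMPLE_NS, ",".join(f"{key}={value}" for key, value in items))


# Repeated calls with the same inputs give the same, reproducible dataset ID.
assert example_dataset_id("flat", "HSC/calib", {"detector": 42}) == \
    example_dataset_id("flat", "HSC/calib", {"detector": 42})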