Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 88%
1from __future__ import annotations
3__all__ = ("ByDimensionsDatasetRecordStorage",)
5import uuid
6from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple
8import sqlalchemy
9from lsst.daf.butler import (
10 CollectionType,
11 DataCoordinate,
12 DataCoordinateSet,
13 DatasetId,
14 DatasetRef,
15 DatasetType,
16 SimpleQuery,
17 Timespan,
18 ddl,
19)
20from lsst.daf.butler.registry import (
21 CollectionTypeError,
22 ConflictingDefinitionError,
23 UnsupportedIdGeneratorError,
24)
25from lsst.daf.butler.registry.interfaces import DatasetIdGenEnum, DatasetRecordStorage
27from ...summaries import GovernorDimensionRestriction
28from .tables import makeTagTableSpec
30if TYPE_CHECKING:  30 ↛ 31 (line 30 didn't jump to line 31, because the condition on line 30 was never true)
31 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
32 from .summaries import CollectionSummaryManager
33 from .tables import StaticDatasetTablesTuple
36class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
37 """Dataset record storage implementation paired with
38 `ByDimensionsDatasetRecordStorageManager`; see that class for more
39 information.
41 Instances of this class should never be constructed directly; use
42 `DatasetRecordStorageManager.register` instead.
43 """
45 def __init__(
46 self,
47 *,
48 datasetType: DatasetType,
49 db: Database,
50 dataset_type_id: int,
51 collections: CollectionManager,
52 static: StaticDatasetTablesTuple,
53 summaries: CollectionSummaryManager,
54 tags: sqlalchemy.schema.Table,
55 calibs: Optional[sqlalchemy.schema.Table],
56 ):
57 super().__init__(datasetType=datasetType)
58 self._dataset_type_id = dataset_type_id
59 self._db = db
60 self._collections = collections
61 self._static = static
62 self._summaries = summaries
63 self._tags = tags
64 self._calibs = calibs
65 self._runKeyColumn = collections.getRunForeignKeyName()
67 def find(
68 self, collection: CollectionRecord, dataId: DataCoordinate, timespan: Optional[Timespan] = None
69 ) -> Optional[DatasetRef]:
70 # Docstring inherited from DatasetRecordStorage.
71 assert dataId.graph == self.datasetType.dimensions
72 if collection.type is CollectionType.CALIBRATION and timespan is None:  72 ↛ 73 (line 72 didn't jump to line 73, because the condition on line 72 was never true)
73 raise TypeError(
74 f"Cannot search for dataset in CALIBRATION collection {collection.name} "
75 f"without an input timespan."
76 )
77 sql = self.select(
78 collection, dataId=dataId, id=SimpleQuery.Select, run=SimpleQuery.Select, timespan=timespan
79 )
80 sql = sql.combine()
81 results = self._db.query(sql)
82 row = results.fetchone()
83 if row is None:
84 return None
85 if collection.type is CollectionType.CALIBRATION:
86 # For temporal calibration lookups (only!) our invariants do not
87 # guarantee that the number of result rows is <= 1.
88 # They would if `select` constrained the given timespan to be
89 # _contained_ by the validity range in the self._calibs table,
90 # instead of simply _overlapping_ it, because we do guarantee that
91 # the validity ranges are disjoint for a particular dataset type,
92 # collection, and data ID. But using an overlap test and a check
93 # for multiple result rows here allows us to provide a more useful
94 # diagnostic, as well as allowing `select` to support more general
95 # queries where multiple results are not an error.
96 if results.fetchone() is not None:
97 raise RuntimeError(
98 f"Multiple matches found for calibration lookup in {collection.name} for "
99 f"{self.datasetType.name} with {dataId} overlapping {timespan}. "
100 )
101 return DatasetRef(
102 datasetType=self.datasetType,
103 dataId=dataId,
104 id=row.id,
105 run=self._collections[row._mapping[self._runKeyColumn]].name,
106 )
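# --- Illustrative sketch (not part of this module): why the calibration lookup
# in ``find`` above must check for a second result row.  Certified validity
# ranges for a given dataset type, collection, and data ID are disjoint, but a
# query timespan can still overlap more than one of them.  A minimal
# reproduction, assuming the in-memory ``Timespan`` accepts astropy ``Time``
# endpoints and exposes ``overlaps()``; the dates are hypothetical.
import astropy.time
from lsst.daf.butler import Timespan

def _t(iso: str) -> astropy.time.Time:
    return astropy.time.Time(iso, scale="tai")

first = Timespan(_t("2021-01-01"), _t("2021-06-01"))   # first certified range
second = Timespan(_t("2021-06-01"), _t("2022-01-01"))  # disjoint neighbour
query = Timespan(_t("2021-05-01"), _t("2021-07-01"))   # caller's search timespan
# Both ranges overlap the query, so an overlap-based SELECT can return two
# rows, and ``find`` raises a diagnostic instead of silently picking one.
print(query.overlaps(first), query.overlaps(second))   # True True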
108 def delete(self, datasets: Iterable[DatasetRef]) -> None:
109 # Docstring inherited from DatasetRecordStorage.
110 # Only delete from common dataset table; ON DELETE foreign key clauses
111 # will handle the rest.
112 self._db.delete(
113 self._static.dataset,
114 ["id"],
115 *[{"id": dataset.getCheckedId()} for dataset in datasets],
116 )
118 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
119 # Docstring inherited from DatasetRecordStorage.
120 if collection.type is not CollectionType.TAGGED:  120 ↛ 121 (line 120 didn't jump to line 121, because the condition on line 120 was never true)
121 raise TypeError(
122 f"Cannot associate into collection '{collection.name}' "
123 f"of type {collection.type.name}; must be TAGGED."
124 )
125 protoRow = {
126 self._collections.getCollectionForeignKeyName(): collection.key,
127 "dataset_type_id": self._dataset_type_id,
128 }
129 rows = []
130 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
131 for dataset in datasets:
132 row = dict(protoRow, dataset_id=dataset.getCheckedId())
133 for dimension, value in dataset.dataId.items():
134 row[dimension.name] = value
135 governorValues.update_extract(dataset.dataId)
136 rows.append(row)
137 # Update the summary tables for this collection in case this is the
138 # first time this dataset type or these governor values will be
139 # inserted there.
140 self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
141 # Update the tag table itself.
142 self._db.replace(self._tags, *rows)
144 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
145 # Docstring inherited from DatasetRecordStorage.
146 if collection.type is not CollectionType.TAGGED:  146 ↛ 147 (line 146 didn't jump to line 147, because the condition on line 146 was never true)
147 raise TypeError(
148 f"Cannot disassociate from collection '{collection.name}' "
149 f"of type {collection.type.name}; must be TAGGED."
150 )
151 rows = [
152 {
153 "dataset_id": dataset.getCheckedId(),
154 self._collections.getCollectionForeignKeyName(): collection.key,
155 }
156 for dataset in datasets
157 ]
158 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows)
160 def _buildCalibOverlapQuery(
161 self, collection: CollectionRecord, dataIds: Optional[DataCoordinateSet], timespan: Timespan
162 ) -> SimpleQuery:
163 assert self._calibs is not None
164 # Start by building a SELECT query for any rows that would overlap
165 # this one.
166 query = SimpleQuery()
167 query.join(self._calibs)
168 # Add a WHERE clause matching the dataset type and collection.
169 query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id)
170 query.where.append(
171 self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key
172 )
173 # Add a WHERE clause matching any of the given data IDs.
174 if dataIds is not None:
175 dataIds.constrain(
176 query,
177 lambda name: self._calibs.columns[name], # type: ignore
178 )
179 # Add WHERE clause for timespan overlaps.
180 TimespanReprClass = self._db.getTimespanRepresentation()
181 query.where.append(
182 TimespanReprClass.fromSelectable(self._calibs).overlaps(TimespanReprClass.fromLiteral(timespan))
183 )
184 return query
186 def certify(
187 self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan
188 ) -> None:
189 # Docstring inherited from DatasetRecordStorage.
190 if self._calibs is None:  190 ↛ 191 (line 190 didn't jump to line 191, because the condition on line 190 was never true)
191 raise CollectionTypeError(
192 f"Cannot certify datasets of type {self.datasetType.name}, for which "
193 f"DatasetType.isCalibration() is False."
194 )
195 if collection.type is not CollectionType.CALIBRATION:  195 ↛ 196 (line 195 didn't jump to line 196, because the condition on line 195 was never true)
196 raise CollectionTypeError(
197 f"Cannot certify into collection '{collection.name}' "
198 f"of type {collection.type.name}; must be CALIBRATION."
199 )
200 TimespanReprClass = self._db.getTimespanRepresentation()
201 protoRow = {
202 self._collections.getCollectionForeignKeyName(): collection.key,
203 "dataset_type_id": self._dataset_type_id,
204 }
205 rows = []
206 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
207 dataIds: Optional[Set[DataCoordinate]] = (
208 set() if not TimespanReprClass.hasExclusionConstraint() else None
209 )
210 for dataset in datasets:
211 row = dict(protoRow, dataset_id=dataset.getCheckedId())
212 for dimension, value in dataset.dataId.items():
213 row[dimension.name] = value
214 TimespanReprClass.update(timespan, result=row)
215 governorValues.update_extract(dataset.dataId)
216 rows.append(row)
217 if dataIds is not None:  217 ↛ 210 (line 217 didn't jump to line 210, because the condition on line 217 was never false)
218 dataIds.add(dataset.dataId)
219 # Update the summary tables for this collection in case this is the
220 # first time this dataset type or these governor values will be
221 # inserted there.
222 self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
223 # Update the association table itself.
224 if TimespanReprClass.hasExclusionConstraint():  224 ↛ 227 (line 224 didn't jump to line 227, because the condition on line 224 was never true)
225 # Rely on database constraint to enforce invariants; we just
226 # reraise the exception for consistency across DB engines.
227 try:
228 self._db.insert(self._calibs, *rows)
229 except sqlalchemy.exc.IntegrityError as err:
230 raise ConflictingDefinitionError(
231 f"Validity range conflict certifying datasets of type {self.datasetType.name} "
232 f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
233 ) from err
234 else:
235 # Have to implement exclusion constraint ourselves.
236 # Start by building a SELECT query for any rows that would overlap
237 # this one.
238 query = self._buildCalibOverlapQuery(
239 collection,
240 DataCoordinateSet(dataIds, graph=self.datasetType.dimensions), # type: ignore
241 timespan,
242 )
243 query.columns.append(sqlalchemy.sql.func.count())
244 sql = query.combine()
245 # Acquire a table lock to ensure there are no concurrent writes
246 # that could invalidate our checking before we finish the inserts. We
247 # use a SAVEPOINT in case there is an outer transaction that a
248 # failure here should not roll back.
249 with self._db.transaction(lock=[self._calibs], savepoint=True):
250 # Run the check SELECT query.
251 conflicting = self._db.query(sql).scalar()
252 if conflicting > 0:
253 raise ConflictingDefinitionError(
254 f"{conflicting} validity range conflicts certifying datasets of type "
255 f"{self.datasetType.name} into {collection.name} for range "
256 f"[{timespan.begin}, {timespan.end})."
257 )
258 # Proceed with the insert.
259 self._db.insert(self._calibs, *rows)
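# --- Illustrative sketch (not part of this module): the manual exclusion-
# constraint check used by ``certify`` above when the database cannot enforce
# timespan-overlap exclusion itself.  Plain SQLAlchemy Core is used here and
# the ``valid_begin``/``valid_end`` column names are hypothetical; the real
# code goes through the ``Database`` abstraction, takes a table lock, and uses
# a SAVEPOINT.
import sqlalchemy

def certify_rows(conn: sqlalchemy.engine.Connection,
                 calibs: sqlalchemy.Table, rows: list, begin, end) -> None:
    """Insert ``rows`` only if no existing row overlaps [begin, end)."""
    with conn.begin():
        # Half-open interval overlap test: existing.begin < end AND existing.end > begin.
        conflicting = conn.execute(
            sqlalchemy.select(sqlalchemy.func.count())
            .select_from(calibs)
            .where(calibs.c.valid_begin < end, calibs.c.valid_end > begin)
        ).scalar()
        if conflicting:
            raise RuntimeError(f"{conflicting} validity range conflicts.")
        conn.execute(sqlalchemy.insert(calibs), rows)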
261 def decertify(
262 self,
263 collection: CollectionRecord,
264 timespan: Timespan,
265 *,
266 dataIds: Optional[Iterable[DataCoordinate]] = None,
267 ) -> None:
268 # Docstring inherited from DatasetRecordStorage.
269 if self._calibs is None:  269 ↛ 270 (line 269 didn't jump to line 270, because the condition on line 269 was never true)
270 raise CollectionTypeError(
271 f"Cannot decertify datasets of type {self.datasetType.name}, for which "
272 f"DatasetType.isCalibration() is False."
273 )
274 if collection.type is not CollectionType.CALIBRATION:  274 ↛ 275 (line 274 didn't jump to line 275, because the condition on line 274 was never true)
275 raise CollectionTypeError(
276 f"Cannot decertify from collection '{collection.name}' "
277 f"of type {collection.type.name}; must be CALIBRATION."
278 )
279 TimespanReprClass = self._db.getTimespanRepresentation()
280 # Construct a SELECT query to find all rows that overlap our inputs.
281 dataIdSet: Optional[DataCoordinateSet]
282 if dataIds is not None:
283 dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions)
284 else:
285 dataIdSet = None
286 query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan)
287 query.columns.extend(self._calibs.columns)
288 sql = query.combine()
289 # Set up collections to populate with the rows we'll want to modify.
290 # The insert rows will have the same values for collection and
291 # dataset type.
292 protoInsertRow = {
293 self._collections.getCollectionForeignKeyName(): collection.key,
294 "dataset_type_id": self._dataset_type_id,
295 }
296 rowsToDelete = []
297 rowsToInsert = []
298 # Acquire a table lock to ensure there are no concurrent writes
299 # between the SELECT and the DELETE and INSERT queries based on it.
300 with self._db.transaction(lock=[self._calibs], savepoint=True):
301 for row in self._db.query(sql).mappings():
302 rowsToDelete.append({"id": row["id"]})
303 # Construct the insert row(s) by copying the prototype row,
304 # then adding the dimension column values, then adding what's
305 # left of the timespan from that row after we subtract the
306 # given timespan.
307 newInsertRow = protoInsertRow.copy()
308 newInsertRow["dataset_id"] = row["dataset_id"]
309 for name in self.datasetType.dimensions.required.names:
310 newInsertRow[name] = row[name]
311 rowTimespan = TimespanReprClass.extract(row)
312 assert rowTimespan is not None, "Field should have a NOT NULL constraint."
313 for diffTimespan in rowTimespan.difference(timespan):
314 rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()))
315 # Run the DELETE and INSERT queries.
316 self._db.delete(self._calibs, ["id"], *rowsToDelete)
317 self._db.insert(self._calibs, *rowsToInsert)
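# --- Illustrative sketch (not part of this module): how ``decertify`` above
# splits an existing validity range around the decertified timespan.  Assumes
# ``Timespan`` accepts astropy ``Time`` endpoints; the dates are hypothetical.
# The real code performs this with ``TimespanReprClass.extract``/``update`` on
# database rows rather than on in-memory objects.
import astropy.time
from lsst.daf.butler import Timespan

def _t(iso: str) -> astropy.time.Time:
    return astropy.time.Time(iso, scale="tai")

existing = Timespan(_t("2021-01-01"), _t("2022-01-01"))  # currently certified
removed = Timespan(_t("2021-04-01"), _t("2021-07-01"))   # range being decertified
# ``difference`` yields the leftover pieces; each one becomes a replacement
# row re-inserted with the same dataset_id and data ID columns.
for piece in existing.difference(removed):
    print(piece)  # expect [2021-01-01, 2021-04-01) and [2021-07-01, 2022-01-01)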
319 def select(
320 self,
321 *collections: CollectionRecord,
322 dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
323 id: SimpleQuery.Select.Or[Optional[int]] = SimpleQuery.Select,
324 run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
325 timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
326 ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
327 ) -> SimpleQuery:
328 # Docstring inherited from DatasetRecordStorage.
329 collection_types = {collection.type for collection in collections}
330 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened."
331 #
332 # There are two tables in play here:
333 #
334 # - the static dataset table (with the dataset ID, dataset type ID,
335 # run ID/name, and ingest date);
336 #
337 # - the dynamic tags/calibs table (with the dataset ID, dataset type
338 # ID, collection ID/name, data ID, and possibly validity
339 # range).
340 #
341 # That means that we might want to return a query against either table
342 # or a JOIN of both, depending on which quantities the caller wants.
343 # But this method is documented/typed such that ``dataId`` is never
344 # `None` - i.e. we always constrain or retrieve the data ID. That
345 # means we'll always include the tags/calibs table and join in the
346 # static dataset table only if we need things from it that we can't get
347 # from the tags/calibs table.
348 #
349 # Note that it's important that we include a WHERE constraint on both
350 # tables for any column (e.g. dataset_type_id) that is in both when
351 # it's given explicitly; not doing so can prevent the query planner from
352 # using very important indexes. At present, we don't include those
353 # redundant columns in the JOIN ON expression, however, because the
354 # FOREIGN KEY (and its index) are defined only on dataset_id.
355 #
356 # We'll start with an empty SimpleQuery, and accumulate kwargs to pass
357 # to its `join` method when we bring in the tags/calibs table.
358 query = SimpleQuery()
359 # We get the data ID or constrain it in the tags/calibs table, but
360 # that's multiple columns, not one, so we need to transform the one
361 # Select.Or argument into a dictionary of them.
362 kwargs: Dict[str, Any]
363 if dataId is SimpleQuery.Select:
364 kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required}
365 else:
366 kwargs = dict(dataId.byName())
367 # We always constrain (never retrieve) the dataset type in at least the
368 # tags/calibs table.
369 kwargs["dataset_type_id"] = self._dataset_type_id
370 # Join in the tags or calibs table, turning those 'kwargs' entries into
371 # WHERE constraints or SELECT columns as appropriate.
372 if collection_types == {CollectionType.CALIBRATION}:
373 assert (
374 self._calibs is not None
375 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
376 TimespanReprClass = self._db.getTimespanRepresentation()
377 # Add the timespan column(s) to the result columns, or constrain
378 # the timespan via an overlap condition.
379 if timespan is SimpleQuery.Select:
380 kwargs.update({k: SimpleQuery.Select for k in TimespanReprClass.getFieldNames()})
381 elif timespan is not None:  381 ↛ 387 (line 381 didn't jump to line 387, because the condition on line 381 was never false)
382 query.where.append(
383 TimespanReprClass.fromSelectable(self._calibs).overlaps(
384 TimespanReprClass.fromLiteral(timespan)
385 )
386 )
387 query.join(self._calibs, **kwargs)
388 dataset_id_col = self._calibs.columns.dataset_id
389 collection_col = self._calibs.columns[self._collections.getCollectionForeignKeyName()]
390 elif CollectionType.CALIBRATION not in collection_types:  390 ↛ 395 (line 390 didn't jump to line 395, because the condition on line 390 was never false)
391 query.join(self._tags, **kwargs)
392 dataset_id_col = self._tags.columns.dataset_id
393 collection_col = self._tags.columns[self._collections.getCollectionForeignKeyName()]
394 else:
395 raise TypeError(
396 "Cannot query for CALIBRATION collections in the same "
397 "subquery as other kinds of collections."
398 )
399 # We always constrain (never retrieve) the collection(s) in the
400 # tags/calibs table.
401 if len(collections) == 1:
402 query.where.append(collection_col == collections[0].key)
403 elif len(collections) == 0:
404 # We support the case where there are no collections as a way to
405 # generate a valid SQL query that can't yield results. This should
406 # never get executed, but lots of downstream code will still try
407 # to access the SQLAlchemy objects representing the columns in the
408 # subquery. That's not ideal, but it'd take a lot of refactoring to
409 # fix it.
410 query.where.append(sqlalchemy.sql.literal(False))
411 else:
412 query.where.append(collection_col.in_([collection.key for collection in collections]))
413 # We can always get the dataset_id from the tags/calibs table or
414 # constrain it there. Can't use kwargs for that because we need to
415 # alias it to 'id'.
416 if id is SimpleQuery.Select:
417 query.columns.append(dataset_id_col.label("id"))
418 elif id is not None:  418 ↛ 419 (line 418 didn't jump to line 419, because the condition on line 418 was never true)
419 query.where.append(dataset_id_col == id)
420 # It's possible we now have everything we need, from just the
421 # tags/calibs table. The things we might need to get from the static
422 # dataset table are the run key and the ingest date.
423 need_static_table = False
424 static_kwargs: Dict[str, Any] = {}
425 if run is not None:
426 assert run is SimpleQuery.Select, "To constrain the run name, pass a RunRecord as a collection."
427 if len(collections) == 1 and collections[0].type is CollectionType.RUN:
428 # If we are searching exactly one RUN collection, we
429 # know that if we find the dataset in that collection,
430 # then that's the dataset's run; we don't need to
431 # query for it.
432 query.columns.append(sqlalchemy.sql.literal(collections[0].key).label(self._runKeyColumn))
433 else:
434 static_kwargs[self._runKeyColumn] = SimpleQuery.Select
435 need_static_table = True
436 # Ingest date can only come from the static table.
437 if ingestDate is not None:
438 need_static_table = True
439 if ingestDate is SimpleQuery.Select:  439 ↛ 442 (line 439 didn't jump to line 442, because the condition on line 439 was never false)
440 static_kwargs["ingest_date"] = SimpleQuery.Select
441 else:
442 assert isinstance(ingestDate, Timespan)
443 # Timespan bounds are astropy Time (usually in TAI) and ingest_date is
444 # a TIMESTAMP column, so convert values to Python datetime for sqlalchemy.
445 if ingestDate.isEmpty():
446 raise RuntimeError("Empty timespan constraint provided for ingest_date.")
447 if ingestDate.begin is not None:
448 begin = ingestDate.begin.utc.datetime # type: ignore
449 query.where.append(self._static.dataset.columns.ingest_date >= begin)
450 if ingestDate.end is not None:
451 end = ingestDate.end.utc.datetime # type: ignore
452 query.where.append(self._static.dataset.columns.ingest_date < end)
453 # If we need the static table, join it in via dataset_id and
454 # dataset_type_id
455 if need_static_table:
456 query.join(
457 self._static.dataset,
458 onclause=(dataset_id_col == self._static.dataset.columns.id),
459 **static_kwargs,
460 )
461 # Also constrain dataset_type_id in static table in case that helps
462 # generate a better plan.
463 # We could also include this in the JOIN ON clause, but my guess is
464 # that that's a good idea IFF it's in the foreign key, and right
465 # now it isn't.
466 query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id)
467 return query
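# --- Illustrative sketch (not part of this module): rough shape of the SQL
# that ``select`` builds when exactly one RUN collection is searched and only
# the data ID, dataset_id, and run are requested.  The table and dimension
# column names here are hypothetical; the run key can be emitted as a literal
# because a dataset found in a single RUN collection is necessarily from that
# run.  If ``ingestDate`` were requested, the static dataset table would also
# be joined in via ``tags.dataset_id = dataset.id`` with a redundant
# dataset_type_id constraint.
EXAMPLE_SELECT_SQL = """
SELECT tags.dataset_id AS id,
       tags.instrument,
       tags.detector,
       :run_key AS run
FROM dataset_tags_table AS tags
WHERE tags.dataset_type_id = :dataset_type_id
  AND tags.collection_key = :collection_key
"""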
469 def getDataId(self, id: DatasetId) -> DataCoordinate:
470 """Return DataId for a dataset.
472 Parameters
473 ----------
474 id : `DatasetId`
475 Unique dataset identifier.
477 Returns
478 -------
479 dataId : `DataCoordinate`
480 DataId for the dataset.
481 """
482 # This query could return multiple rows (one for each tagged collection
483 # the dataset is in, plus one for its run collection), and we don't
484 # care which of those we get.
485 sql = (
486 self._tags.select()
487 .where(
488 sqlalchemy.sql.and_(
489 self._tags.columns.dataset_id == id,
490 self._tags.columns.dataset_type_id == self._dataset_type_id,
491 )
492 )
493 .limit(1)
494 )
495 row = self._db.query(sql).mappings().fetchone()
496 assert row is not None, "Should be guaranteed by caller and foreign key constraints."
497 return DataCoordinate.standardize(
498 {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
499 graph=self.datasetType.dimensions,
500 )
503class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage):
504 """Implementation of ByDimensionsDatasetRecordStorage which uses integer
505 auto-incremented column for dataset IDs.
506 """
508 def insert(
509 self,
510 run: RunRecord,
511 dataIds: Iterable[DataCoordinate],
512 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
513 ) -> Iterator[DatasetRef]:
514 # Docstring inherited from DatasetRecordStorage.
516 # We only support UNIQUE mode for integer dataset IDs
517 if idMode != DatasetIdGenEnum.UNIQUE:  517 ↛ 518 (line 517 didn't jump to line 518, because the condition on line 517 was never true)
518 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.")
520 # Transform a possibly-single-pass iterable into a list.
521 dataIdList = list(dataIds)
522 yield from self._insert(run, dataIdList)
524 def import_(
525 self,
526 run: RunRecord,
527 datasets: Iterable[DatasetRef],
528 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
529 reuseIds: bool = False,
530 ) -> Iterator[DatasetRef]:
531 # Docstring inherited from DatasetRecordStorage.
533 # We only support UNIQUE mode for integer dataset IDs
534 if idGenerationMode != DatasetIdGenEnum.UNIQUE:  534 ↛ 535 (line 534 didn't jump to line 535, because the condition on line 534 was never true)
535 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.")
537 # Make a list of dataIds and optionally dataset IDs.
538 dataIdList: List[DataCoordinate] = []
539 datasetIdList: List[int] = []
540 for dataset in datasets:
541 dataIdList.append(dataset.dataId)
543 # We only accept integer dataset IDs, but also allow None.
544 datasetId = dataset.id
545 if datasetId is None:  545 ↛ 547 (line 545 didn't jump to line 547, because the condition on line 545 was never true)
546 # if reuseIds is set then all IDs must be known
547 if reuseIds:
548 raise TypeError("All dataset IDs must be known if `reuseIds` is set")
549 elif isinstance(datasetId, int):  549 ↛ 553 (line 549 didn't jump to line 553, because the condition on line 549 was never false)
550 if reuseIds:
551 datasetIdList.append(datasetId)
552 else:
553 raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}")
555 yield from self._insert(run, dataIdList, datasetIdList)
557 def _insert(
558 self, run: RunRecord, dataIdList: List[DataCoordinate], datasetIdList: Optional[List[int]] = None
559 ) -> Iterator[DatasetRef]:
560 """Common part of implementation of `insert` and `import_` methods."""
562 # Remember any governor dimension values we see.
563 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
564 for dataId in dataIdList:
565 governorValues.update_extract(dataId)
567 staticRow = {
568 "dataset_type_id": self._dataset_type_id,
569 self._runKeyColumn: run.key,
570 }
571 with self._db.transaction():
572 # Insert into the static dataset table, generating autoincrement
573 # dataset_id values.
574 if datasetIdList:
575 # reuse existing IDs
576 rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList]
577 self._db.insert(self._static.dataset, *rows)
578 else:
579 # use auto-incremented IDs
580 datasetIdList = self._db.insert(
581 self._static.dataset, *([staticRow] * len(dataIdList)), returnIds=True
582 )
583 assert datasetIdList is not None
584 # Update the summary tables for this collection in case this is the
585 # first time this dataset type or these governor values will be
586 # inserted there.
587 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
588 # Combine the generated dataset_id values and data ID fields to
589 # form rows to be inserted into the tags table.
590 protoTagsRow = {
591 "dataset_type_id": self._dataset_type_id,
592 self._collections.getCollectionForeignKeyName(): run.key,
593 }
594 tagsRows = [
595 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
596 for dataId, dataset_id in zip(dataIdList, datasetIdList)
597 ]
598 # Insert those rows into the tags table. This is where we'll
599 # get any unique constraint violations.
600 self._db.insert(self._tags, *tagsRows)
602 for dataId, datasetId in zip(dataIdList, datasetIdList):
603 yield DatasetRef(
604 datasetType=self.datasetType,
605 dataId=dataId,
606 id=datasetId,
607 run=run.name,
608 )
611class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
612 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for
613 dataset IDs.
614 """
616 NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
617 """Namespace UUID used for UUID5 generation. Do not change. This was
618 produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
619 """
621 def insert(
622 self,
623 run: RunRecord,
624 dataIds: Iterable[DataCoordinate],
625 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
626 ) -> Iterator[DatasetRef]:
627 # Docstring inherited from DatasetRecordStorage.
629 # Remember any governor dimension values we see.
630 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
632 # Iterate over data IDs, transforming a possibly-single-pass iterable
633 # into a list.
634 dataIdList = []
635 rows = []
636 for dataId in dataIds:
637 dataIdList.append(dataId)
638 rows.append(
639 {
640 "id": self._makeDatasetId(run, dataId, idMode),
641 "dataset_type_id": self._dataset_type_id,
642 self._runKeyColumn: run.key,
643 }
644 )
645 governorValues.update_extract(dataId)
647 with self._db.transaction():
648 # Insert into the static dataset table.
649 self._db.insert(self._static.dataset, *rows)
650 # Update the summary tables for this collection in case this is the
651 # first time this dataset type or these governor values will be
652 # inserted there.
653 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
654 # Combine the generated dataset_id values and data ID fields to
655 # form rows to be inserted into the tags table.
656 protoTagsRow = {
657 "dataset_type_id": self._dataset_type_id,
658 self._collections.getCollectionForeignKeyName(): run.key,
659 }
660 tagsRows = [
661 dict(protoTagsRow, dataset_id=row["id"], **dataId.byName())
662 for dataId, row in zip(dataIdList, rows)
663 ]
664 # Insert those rows into the tags table.
665 self._db.insert(self._tags, *tagsRows)
667 for dataId, row in zip(dataIdList, rows):
668 yield DatasetRef(
669 datasetType=self.datasetType,
670 dataId=dataId,
671 id=row["id"],
672 run=run.name,
673 )
675 def import_(
676 self,
677 run: RunRecord,
678 datasets: Iterable[DatasetRef],
679 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
680 reuseIds: bool = False,
681 ) -> Iterator[DatasetRef]:
682 # Docstring inherited from DatasetRecordStorage.
684 # Remember any governor dimension values we see.
685 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
687 # Iterate over data IDs, transforming a possibly-single-pass iterable
688 # into a list.
689 dataIds = {}
690 for dataset in datasets:
691 # Ignore unknown ID types, normally all IDs have the same type but
692 # this code supports mixed types or missing IDs.
693 datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None
694 if datasetId is None:
695 datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode)
696 dataIds[datasetId] = dataset.dataId
697 governorValues.update_extract(dataset.dataId)
699 with self._db.session() as session:
701 # insert all new rows into a temporary table
702 tableSpec = makeTagTableSpec(
703 self.datasetType, type(self._collections), ddl.GUID, constraints=False
704 )
705 tmp_tags = session.makeTemporaryTable(tableSpec)
707 collFkName = self._collections.getCollectionForeignKeyName()
708 protoTagsRow = {
709 "dataset_type_id": self._dataset_type_id,
710 collFkName: run.key,
711 }
712 tmpRows = [
713 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
714 for dataset_id, dataId in dataIds.items()
715 ]
717 with self._db.transaction():
719 # store all incoming data in a temporary table
720 self._db.insert(tmp_tags, *tmpRows)
722 # There are some checks that we want to make for consistency
723 # of the new datasets with existing ones.
724 self._validateImport(tmp_tags, run)
726 # Before we merge temporary table into dataset/tags we need to
727 # drop datasets which are already there (and do not conflict).
728 self._db.deleteWhere(
729 tmp_tags,
730 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)),
731 )
733 # Copy it into dataset table, need to re-label some columns.
734 self._db.insert(
735 self._static.dataset,
736 select=sqlalchemy.sql.select(
737 tmp_tags.columns.dataset_id.label("id"),
738 tmp_tags.columns.dataset_type_id,
739 tmp_tags.columns[collFkName].label(self._runKeyColumn),
740 ),
741 )
743 # Update the summary tables for this collection in case this
744 # is the first time this dataset type or these governor values
745 # will be inserted there.
746 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
748 # Copy it into tags table.
749 self._db.insert(self._tags, select=tmp_tags.select())
751 # Return refs in the same order as in the input list.
752 for dataset_id, dataId in dataIds.items():
753 yield DatasetRef(
754 datasetType=self.datasetType,
755 id=dataset_id,
756 dataId=dataId,
757 run=run.name,
758 )
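# --- Illustrative sketch (not part of this module): the "stage rows in a
# temporary table, drop the IDs that already exist, then INSERT ... SELECT"
# pattern that ``import_`` above uses to make imports idempotent.  Plain
# SQLAlchemy Core with hypothetical column names; the real code goes through
# the ``Database``/session abstraction and also runs ``_validateImport`` first.
import sqlalchemy

def copy_new_datasets(conn: sqlalchemy.engine.Connection,
                      tmp_tags: sqlalchemy.Table,
                      dataset: sqlalchemy.Table,
                      rows: list) -> None:
    with conn.begin():
        # Stage everything we were asked to import.
        conn.execute(sqlalchemy.insert(tmp_tags), rows)
        # Drop datasets whose IDs are already present (assumed consistent).
        conn.execute(
            sqlalchemy.delete(tmp_tags).where(
                tmp_tags.c.dataset_id.in_(sqlalchemy.select(dataset.c.id))
            )
        )
        # Copy the remainder into the permanent table, relabelling columns.
        conn.execute(
            sqlalchemy.insert(dataset).from_select(
                ["id", "dataset_type_id", "run_id"],
                sqlalchemy.select(
                    tmp_tags.c.dataset_id,
                    tmp_tags.c.dataset_type_id,
                    tmp_tags.c.run_id,
                ),
            )
        )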
760 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None:
761 """Validate imported refs against existing datasets.
763 Parameters
764 ----------
765 tmp_tags : `sqlalchemy.schema.Table`
766 Temporary table with new datasets and the same schema as tags
767 table.
768 run : `RunRecord`
769 The record object describing the `~CollectionType.RUN` collection.
771 Raises
772 ------
773 ConflictingDefinitionError
774 Raise if new datasets conflict with existing ones.
775 """
776 dataset = self._static.dataset
777 tags = self._tags
778 collFkName = self._collections.getCollectionForeignKeyName()
780 # Check that existing datasets have the same dataset type and
781 # run.
782 query = (
783 sqlalchemy.sql.select(
784 dataset.columns.id.label("dataset_id"),
785 dataset.columns.dataset_type_id.label("dataset_type_id"),
786 tmp_tags.columns.dataset_type_id.label("new dataset_type_id"),
787 dataset.columns[self._runKeyColumn].label("run"),
788 tmp_tags.columns[collFkName].label("new run"),
789 )
790 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id))
791 .where(
792 sqlalchemy.sql.or_(
793 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
794 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName],
795 )
796 )
797 )
798 result = self._db.query(query)
799 if (row := result.first()) is not None:
800 # Only include the first one in the exception message
801 raise ConflictingDefinitionError(
802 f"Existing dataset type or run do not match new dataset: {row._asdict()}"
803 )
805 # Check that matching dataset in tags table has the same DataId.
806 query = (
807 sqlalchemy.sql.select(
808 tags.columns.dataset_id,
809 tags.columns.dataset_type_id.label("type_id"),
810 tmp_tags.columns.dataset_type_id.label("new type_id"),
811 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
812 *[
813 tmp_tags.columns[dim].label(f"new {dim}")
814 for dim in self.datasetType.dimensions.required.names
815 ],
816 )
817 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id))
818 .where(
819 sqlalchemy.sql.or_(
820 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
821 *[
822 tags.columns[dim] != tmp_tags.columns[dim]
823 for dim in self.datasetType.dimensions.required.names
824 ],
825 )
826 )
827 )
828 result = self._db.query(query)
829 if (row := result.first()) is not None:
830 # Only include the first one in the exception message
831 raise ConflictingDefinitionError(
832 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}"
833 )
835 # Check that matching run+dataId have the same dataset ID.
836 query = (
837 sqlalchemy.sql.select(
838 tags.columns.dataset_type_id.label("dataset_type_id"),
839 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
840 tags.columns.dataset_id,
841 tmp_tags.columns.dataset_id.label("new dataset_id"),
842 tags.columns[collFkName],
843 tmp_tags.columns[collFkName].label(f"new {collFkName}"),
844 )
845 .select_from(
846 tags.join(
847 tmp_tags,
848 sqlalchemy.sql.and_(
849 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id,
850 tags.columns[collFkName] == tmp_tags.columns[collFkName],
851 *[
852 tags.columns[dim] == tmp_tags.columns[dim]
853 for dim in self.datasetType.dimensions.required.names
854 ],
855 ),
856 )
857 )
858 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id)
859 )
860 result = self._db.query(query)
861 if (row := result.first()) is not None:
862 # only include the first one in the exception message
863 raise ConflictingDefinitionError(
864 f"Existing dataset type and dataId does not match new dataset: {row._asdict()}"
865 )
867 def _makeDatasetId(
868 self, run: RunRecord, dataId: DataCoordinate, idGenerationMode: DatasetIdGenEnum
869 ) -> uuid.UUID:
870 """Generate dataset ID for a dataset.
872 Parameters
873 ----------
874 run : `RunRecord`
875 The record object describing the RUN collection for the dataset.
876 dataId : `DataCoordinate`
877 Expanded data ID for the dataset.
878 idGenerationMode : `DatasetIdGenEnum`
879 ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
880 UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
881 deterministic UUID5-type ID based on a dataset type name and
882 ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
883 deterministic UUID5-type ID based on a dataset type name, run
884 collection name, and ``dataId``.
886 Returns
887 -------
888 datasetId : `uuid.UUID`
889 Dataset identifier.
890 """
891 if idGenerationMode is DatasetIdGenEnum.UNIQUE:
892 return uuid.uuid4()
893 else:
894 # WARNING: If you modify this code make sure that the order of
895 # items in the `items` list below never changes.
896 items: List[Tuple[str, str]] = []
897 if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
898 items = [
899 ("dataset_type", self.datasetType.name),
900 ]
901 elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:  901 ↛ 907 (line 901 didn't jump to line 907, because the condition on line 901 was never false)
902 items = [
903 ("dataset_type", self.datasetType.name),
904 ("run", run.name),
905 ]
906 else:
907 raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")
909 for name, value in sorted(dataId.byName().items()):
910 items.append((name, str(value)))
911 data = ",".join(f"{key}={value}" for key, value in items)
912 return uuid.uuid5(self.NS_UUID, data)
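# --- Illustrative sketch (not part of this module): standalone reproduction of
# the deterministic DATAID_TYPE_RUN ID scheme implemented above, using only the
# standard library.  The dataset type name, run name, and data ID values below
# are hypothetical.
import uuid

NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")  # uuid5(NAMESPACE_DNS, "lsst.org")

def make_dataset_id(dataset_type: str, run: str, data_id: dict) -> uuid.UUID:
    # Order matters: fixed items first, then data ID entries sorted by key.
    items = [("dataset_type", dataset_type), ("run", run)]
    items.extend(sorted((key, str(value)) for key, value in data_id.items()))
    return uuid.uuid5(NS_UUID, ",".join(f"{key}={value}" for key, value in items))

# The same inputs always produce the same UUID, so re-importing a dataset is
# idempotent with respect to its ID.
print(make_dataset_id("flat", "HSC/calib", {"instrument": "HSC", "detector": 42}))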