Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 88%

from __future__ import annotations

__all__ = ("ByDimensionsDatasetRecordStorage",)

import uuid
from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple

import sqlalchemy
from lsst.daf.butler import (
    CollectionType,
    DataCoordinate,
    DataCoordinateSet,
    DatasetId,
    DatasetRef,
    DatasetType,
    SimpleQuery,
    Timespan,
    ddl,
)
from lsst.daf.butler.registry import ConflictingDefinitionError, UnsupportedIdGeneratorError
from lsst.daf.butler.registry.interfaces import DatasetIdGenEnum, DatasetRecordStorage

from ...summaries import GovernorDimensionRestriction
from .tables import makeTagTableSpec

if TYPE_CHECKING:  # coverage: condition never true
    from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
    from .summaries import CollectionSummaryManager
    from .tables import StaticDatasetTablesTuple


class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
    """Dataset record storage implementation paired with
    `ByDimensionsDatasetRecordStorageManager`; see that class for more
    information.

    Instances of this class should never be constructed directly; use
    `DatasetRecordStorageManager.register` instead.
    """

    def __init__(
        self,
        *,
        datasetType: DatasetType,
        db: Database,
        dataset_type_id: int,
        collections: CollectionManager,
        static: StaticDatasetTablesTuple,
        summaries: CollectionSummaryManager,
        tags: sqlalchemy.schema.Table,
        calibs: Optional[sqlalchemy.schema.Table],
    ):
        super().__init__(datasetType=datasetType)
        self._dataset_type_id = dataset_type_id
        self._db = db
        self._collections = collections
        self._static = static
        self._summaries = summaries
        self._tags = tags
        self._calibs = calibs
        self._runKeyColumn = collections.getRunForeignKeyName()

    def find(
        self, collection: CollectionRecord, dataId: DataCoordinate, timespan: Optional[Timespan] = None
    ) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.
        assert dataId.graph == self.datasetType.dimensions
        if collection.type is CollectionType.CALIBRATION and timespan is None:  # coverage: condition never true
            raise TypeError(
                f"Cannot search for dataset in CALIBRATION collection {collection.name} "
                f"without an input timespan."
            )
        sql = self.select(
            collection, dataId=dataId, id=SimpleQuery.Select, run=SimpleQuery.Select, timespan=timespan
        )
        sql = sql.combine()
        results = self._db.query(sql)
        row = results.fetchone()
        if row is None:
            return None
        if collection.type is CollectionType.CALIBRATION:
            # For temporal calibration lookups (only!) our invariants do not
            # guarantee that the number of result rows is <= 1.
            # They would if `select` constrained the given timespan to be
            # _contained_ by the validity range in the self._calibs table,
            # instead of simply _overlapping_ it, because we do guarantee that
            # the validity ranges are disjoint for a particular dataset type,
            # collection, and data ID.  But using an overlap test and a check
            # for multiple result rows here allows us to provide a more useful
            # diagnostic, as well as allowing `select` to support more general
            # queries where multiple results are not an error.
            if results.fetchone() is not None:
                raise RuntimeError(
                    f"Multiple matches found for calibration lookup in {collection.name} for "
                    f"{self.datasetType.name} with {dataId} overlapping {timespan}. "
                )
        return DatasetRef(
            datasetType=self.datasetType,
            dataId=dataId,
            id=row.id,
            run=self._collections[row._mapping[self._runKeyColumn]].name,
        )

    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        # Only delete from common dataset table; ON DELETE foreign key clauses
        # will handle the rest.
        self._db.delete(
            self._static.dataset,
            ["id"],
            *[{"id": dataset.getCheckedId()} for dataset in datasets],
        )

    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:  # coverage: condition never true
            raise TypeError(
                f"Cannot associate into collection '{collection.name}' "
                f"of type {collection.type.name}; must be TAGGED."
            )
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            governorValues.update_extract(dataset.dataId)
            rows.append(row)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values will be
        # inserted there.
        self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
        # Update the tag table itself.
        self._db.replace(self._tags, *rows)

    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:  # coverage: condition never true
            raise TypeError(
                f"Cannot disassociate from collection '{collection.name}' "
                f"of type {collection.type.name}; must be TAGGED."
            )
        rows = [
            {
                "dataset_id": dataset.getCheckedId(),
                self._collections.getCollectionForeignKeyName(): collection.key,
            }
            for dataset in datasets
        ]
        self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows)

    def _buildCalibOverlapQuery(
        self, collection: CollectionRecord, dataIds: Optional[DataCoordinateSet], timespan: Timespan
    ) -> SimpleQuery:
        assert self._calibs is not None
        # Start by building a SELECT query for any rows that would overlap
        # this one.
        query = SimpleQuery()
        query.join(self._calibs)
        # Add a WHERE clause matching the dataset type and collection.
        query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id)
        query.where.append(
            self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key
        )
        # Add a WHERE clause matching any of the given data IDs.
        if dataIds is not None:
            dataIds.constrain(
                query,
                lambda name: self._calibs.columns[name],  # type: ignore
            )
        # Add WHERE clause for timespan overlaps.
        TimespanReprClass = self._db.getTimespanRepresentation()
        query.where.append(
            TimespanReprClass.fromSelectable(self._calibs).overlaps(TimespanReprClass.fromLiteral(timespan))
        )
        return query

    def certify(
        self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan
    ) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:  # coverage: condition never true
            raise TypeError(
                f"Cannot certify datasets of type {self.datasetType.name}, for which "
                f"DatasetType.isCalibration() is False."
            )
        if collection.type is not CollectionType.CALIBRATION:  # coverage: condition never true
            raise TypeError(
                f"Cannot certify into collection '{collection.name}' "
                f"of type {collection.type.name}; must be CALIBRATION."
            )
        TimespanReprClass = self._db.getTimespanRepresentation()
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        dataIds: Optional[Set[DataCoordinate]] = (
            set() if not TimespanReprClass.hasExclusionConstraint() else None
        )
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            TimespanReprClass.update(timespan, result=row)
            governorValues.update_extract(dataset.dataId)
            rows.append(row)
            if dataIds is not None:  # coverage: condition never false
                dataIds.add(dataset.dataId)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values will be
        # inserted there.
        self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
        # Update the association table itself.
        if TimespanReprClass.hasExclusionConstraint():  # coverage: condition never true
            # Rely on the database constraint to enforce invariants; we just
            # reraise the exception for consistency across DB engines.
            try:
                self._db.insert(self._calibs, *rows)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Validity range conflict certifying datasets of type {self.datasetType.name} "
                    f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
                ) from err
        else:
            # Have to implement the exclusion constraint ourselves.
            # Start by building a SELECT query for any rows that would overlap
            # this one.
            query = self._buildCalibOverlapQuery(
                collection,
                DataCoordinateSet(dataIds, graph=self.datasetType.dimensions),  # type: ignore
                timespan,
            )
            query.columns.append(sqlalchemy.sql.func.count())
            sql = query.combine()
            # Acquire a table lock to ensure there are no concurrent writes
            # that could invalidate our check before we finish the inserts.
            # We use a SAVEPOINT in case there is an outer transaction that a
            # failure here should not roll back.
            with self._db.transaction(lock=[self._calibs], savepoint=True):
                # Run the check SELECT query.
                conflicting = self._db.query(sql).scalar()
                if conflicting > 0:
                    raise ConflictingDefinitionError(
                        f"{conflicting} validity range conflicts certifying datasets of type "
                        f"{self.datasetType.name} into {collection.name} for range "
                        f"[{timespan.begin}, {timespan.end})."
                    )
                # Proceed with the insert.
                self._db.insert(self._calibs, *rows)

    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Optional[Iterable[DataCoordinate]] = None,
    ) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:  # coverage: condition never true
            raise TypeError(
                f"Cannot decertify datasets of type {self.datasetType.name}, for which "
                f"DatasetType.isCalibration() is False."
            )
        if collection.type is not CollectionType.CALIBRATION:  # coverage: condition never true
            raise TypeError(
                f"Cannot decertify from collection '{collection.name}' "
                f"of type {collection.type.name}; must be CALIBRATION."
            )
        TimespanReprClass = self._db.getTimespanRepresentation()
        # Construct a SELECT query to find all rows that overlap our inputs.
        dataIdSet: Optional[DataCoordinateSet]
        if dataIds is not None:
            dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions)
        else:
            dataIdSet = None
        query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan)
        query.columns.extend(self._calibs.columns)
        sql = query.combine()
        # Set up collections to populate with the rows we'll want to modify.
        # The insert rows will have the same values for collection and
        # dataset type.
        protoInsertRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rowsToDelete = []
        rowsToInsert = []
        # Acquire a table lock to ensure there are no concurrent writes
        # between the SELECT and the DELETE and INSERT queries based on it.
        with self._db.transaction(lock=[self._calibs], savepoint=True):
            for row in self._db.query(sql).mappings():
                rowsToDelete.append({"id": row["id"]})
                # Construct the insert row(s) by copying the prototype row,
                # then adding the dimension column values, then adding what's
                # left of the timespan from that row after we subtract the
                # given timespan.
                newInsertRow = protoInsertRow.copy()
                newInsertRow["dataset_id"] = row["dataset_id"]
                for name in self.datasetType.dimensions.required.names:
                    newInsertRow[name] = row[name]
                rowTimespan = TimespanReprClass.extract(row)
                assert rowTimespan is not None, "Field should have a NOT NULL constraint."
                for diffTimespan in rowTimespan.difference(timespan):
                    rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()))
            # Run the DELETE and INSERT queries.
            self._db.delete(self._calibs, ["id"], *rowsToDelete)
            self._db.insert(self._calibs, *rowsToInsert)

    def select(
        self,
        *collections: CollectionRecord,
        dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
        id: SimpleQuery.Select.Or[Optional[int]] = SimpleQuery.Select,
        run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
        timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
        ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
    ) -> SimpleQuery:
        # Docstring inherited from DatasetRecordStorage.
        collection_types = {collection.type for collection in collections}
        assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened."
        #
        # There are two tables in play here:
        #
        # - the static dataset table (with the dataset ID, dataset type ID,
        #   run ID/name, and ingest date);
        #
        # - the dynamic tags/calibs table (with the dataset ID, dataset
        #   type ID, collection ID/name, data ID, and possibly validity
        #   range).
        #
        # That means that we might want to return a query against either table
        # or a JOIN of both, depending on which quantities the caller wants.
        # But this method is documented/typed such that ``dataId`` is never
        # `None` - i.e. we always constrain or retrieve the data ID.  That
        # means we'll always include the tags/calibs table and join in the
        # static dataset table only if we need things from it that we can't
        # get from the tags/calibs table.
        #
        # Note that it's important that we include a WHERE constraint on both
        # tables for any column (e.g. dataset_type_id) that is in both when
        # it's given explicitly; not doing so can prevent the query planner
        # from using very important indexes.  At present, we don't include
        # those redundant columns in the JOIN ON expression, however, because
        # the FOREIGN KEY (and its index) are defined only on dataset_id.
        #
        # We'll start with an empty SimpleQuery, and accumulate kwargs to pass
        # to its `join` method when we bring in the tags/calibs table.
        query = SimpleQuery()
        # We get the data ID or constrain it in the tags/calibs table, but
        # that's multiple columns, not one, so we need to transform the one
        # Select.Or argument into a dictionary of them.
        kwargs: Dict[str, Any]
        if dataId is SimpleQuery.Select:
            kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required}
        else:
            kwargs = dict(dataId.byName())
        # We always constrain (never retrieve) the dataset type in at least
        # the tags/calibs table.
        kwargs["dataset_type_id"] = self._dataset_type_id
        # Join in the tags or calibs table, turning those 'kwargs' entries
        # into WHERE constraints or SELECT columns as appropriate.
        if collection_types == {CollectionType.CALIBRATION}:
            assert (
                self._calibs is not None
            ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
            TimespanReprClass = self._db.getTimespanRepresentation()
            # Add the timespan column(s) to the result columns, or constrain
            # the timespan via an overlap condition.
            if timespan is SimpleQuery.Select:
                kwargs.update({k: SimpleQuery.Select for k in TimespanReprClass.getFieldNames()})
            elif timespan is not None:  # coverage: condition never false
                query.where.append(
                    TimespanReprClass.fromSelectable(self._calibs).overlaps(
                        TimespanReprClass.fromLiteral(timespan)
                    )
                )
            query.join(self._calibs, **kwargs)
            dataset_id_col = self._calibs.columns.dataset_id
            collection_col = self._calibs.columns[self._collections.getCollectionForeignKeyName()]
        elif CollectionType.CALIBRATION not in collection_types:  # coverage: condition never false
            query.join(self._tags, **kwargs)
            dataset_id_col = self._tags.columns.dataset_id
            collection_col = self._tags.columns[self._collections.getCollectionForeignKeyName()]
        else:
            raise TypeError(
                "Cannot query for CALIBRATION collections in the same "
                "subquery as other kinds of collections."
            )
        # We always constrain (never retrieve) the collection(s) in the
        # tags/calibs table.
        if len(collections) == 1:
            query.where.append(collection_col == collections[0].key)
        elif len(collections) == 0:
            # We support the case where there are no collections as a way to
            # generate a valid SQL query that can't yield results.  This
            # should never get executed, but lots of downstream code will
            # still try to access the SQLAlchemy objects representing the
            # columns in the subquery.  That's not ideal, but it'd take a lot
            # of refactoring to fix it.
            query.where.append(sqlalchemy.sql.literal(False))
        else:
            query.where.append(collection_col.in_([collection.key for collection in collections]))
        # We can always get the dataset_id from the tags/calibs table or
        # constrain it there.  Can't use kwargs for that because we need to
        # alias it to 'id'.
        if id is SimpleQuery.Select:
            query.columns.append(dataset_id_col.label("id"))
        elif id is not None:  # coverage: condition never true
            query.where.append(dataset_id_col == id)
        # It's possible we now have everything we need, from just the
        # tags/calibs table.  The things we might need to get from the static
        # dataset table are the run key and the ingest date.
        need_static_table = False
        static_kwargs: Dict[str, Any] = {}
        if run is not None:
            assert run is SimpleQuery.Select, "To constrain the run name, pass a RunRecord as a collection."
            if len(collections) == 1 and collections[0].type is CollectionType.RUN:
                # If we are searching exactly one RUN collection, we
                # know that if we find the dataset in that collection,
                # then that's the dataset's run; we don't need to
                # query for it.
                query.columns.append(sqlalchemy.sql.literal(collections[0].key).label(self._runKeyColumn))
            else:
                static_kwargs[self._runKeyColumn] = SimpleQuery.Select
                need_static_table = True
        # Ingest date can only come from the static table.
        if ingestDate is not None:
            need_static_table = True
            if ingestDate is SimpleQuery.Select:  # coverage: condition never false
                static_kwargs["ingest_date"] = SimpleQuery.Select
            else:
                assert isinstance(ingestDate, Timespan)
                # Timespan bounds are astropy Time (usually TAI) while
                # ingest_date is a TIMESTAMP column, so convert the values to
                # Python datetime for SQLAlchemy.
                if ingestDate.isEmpty():
                    raise RuntimeError("Empty timespan constraint provided for ingest_date.")
                if ingestDate.begin is not None:
                    begin = ingestDate.begin.utc.datetime  # type: ignore
                    query.where.append(self._static.dataset.columns.ingest_date >= begin)
                if ingestDate.end is not None:
                    end = ingestDate.end.utc.datetime  # type: ignore
                    query.where.append(self._static.dataset.columns.ingest_date < end)
        # If we need the static table, join it in via dataset_id and
        # dataset_type_id.
        if need_static_table:
            query.join(
                self._static.dataset,
                onclause=(dataset_id_col == self._static.dataset.columns.id),
                **static_kwargs,
            )
            # Also constrain dataset_type_id in the static table in case that
            # helps generate a better plan.
            # We could also include this in the JOIN ON clause, but my guess
            # is that that's a good idea IFF it's in the foreign key, and
            # right now it isn't.
            query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id)
        return query

    def getDataId(self, id: DatasetId) -> DataCoordinate:
        """Return DataId for a dataset.

        Parameters
        ----------
        id : `DatasetId`
            Unique dataset identifier.

        Returns
        -------
        dataId : `DataCoordinate`
            DataId for the dataset.
        """
        # This query could return multiple rows (one for each tagged
        # collection the dataset is in, plus one for its run collection),
        # and we don't care which of those we get.
        sql = (
            self._tags.select()
            .where(
                sqlalchemy.sql.and_(
                    self._tags.columns.dataset_id == id,
                    self._tags.columns.dataset_type_id == self._dataset_type_id,
                )
            )
            .limit(1)
        )
        row = self._db.query(sql).mappings().fetchone()
        assert row is not None, "Should be guaranteed by caller and foreign key constraints."
        return DataCoordinate.standardize(
            {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
            graph=self.datasetType.dimensions,
        )
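

# The `certify` method above emulates a database exclusion constraint by hand
# when the timespan representation does not provide one: it locks the calibs
# table, counts rows whose validity range overlaps the new one, and only then
# inserts.  The helper below is a minimal, self-contained sketch of that
# overlap count, using (begin, end) integer tuples as stand-ins for timespan
# rows; it is illustrative only and not part of the Registry API.
def _sketch_count_validity_conflicts(
    existing: List[Tuple[int, int]], candidate: Tuple[int, int]
) -> int:
    """Count half-open ranges in ``existing`` that overlap ``candidate``."""
    begin, end = candidate
    return sum(1 for b, e in existing if b < end and begin < e)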


class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage):
    """Implementation of ByDimensionsDatasetRecordStorage which uses an
    integer auto-incremented column for dataset IDs.
    """

    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # We only support UNIQUE mode for integer dataset IDs.
        if idMode != DatasetIdGenEnum.UNIQUE:  # coverage: condition never true
            raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.")

        # Transform a possibly-single-pass iterable into a list.
        dataIdList = list(dataIds)
        yield from self._insert(run, dataIdList)

    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # We only support UNIQUE mode for integer dataset IDs.
        if idGenerationMode != DatasetIdGenEnum.UNIQUE:  # coverage: condition never true
            raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.")

        # Make a list of dataIds and optionally dataset IDs.
        dataIdList: List[DataCoordinate] = []
        datasetIdList: List[int] = []
        for dataset in datasets:
            dataIdList.append(dataset.dataId)

            # We only accept integer dataset IDs, but also allow None.
            datasetId = dataset.id
            if datasetId is None:  # coverage: condition never true
                # If reuseIds is set then all IDs must be known.
                if reuseIds:
                    raise TypeError("All dataset IDs must be known if `reuseIds` is set")
            elif isinstance(datasetId, int):  # coverage: condition never false
                if reuseIds:
                    datasetIdList.append(datasetId)
            else:
                raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}")

        yield from self._insert(run, dataIdList, datasetIdList)

    def _insert(
        self, run: RunRecord, dataIdList: List[DataCoordinate], datasetIdList: Optional[List[int]] = None
    ) -> Iterator[DatasetRef]:
        """Common part of implementation of `insert` and `import_` methods."""

        # Remember any governor dimension values we see.
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataId in dataIdList:
            governorValues.update_extract(dataId)

        staticRow = {
            "dataset_type_id": self._dataset_type_id,
            self._runKeyColumn: run.key,
        }
        with self._db.transaction():
            # Insert into the static dataset table, generating autoincrement
            # dataset_id values.
            if datasetIdList:
                # Reuse existing IDs.
                rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList]
                self._db.insert(self._static.dataset, *rows)
            else:
                # Use auto-incremented IDs.
                datasetIdList = self._db.insert(
                    self._static.dataset, *([staticRow] * len(dataIdList)), returnIds=True
                )
                assert datasetIdList is not None
            # Update the summary tables for this collection in case this is the
            # first time this dataset type or these governor values will be
            # inserted there.
            self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
            # Combine the generated dataset_id values and data ID fields to
            # form rows to be inserted into the tags table.
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): run.key,
            }
            tagsRows = [
                dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
                for dataId, dataset_id in zip(dataIdList, datasetIdList)
            ]
            # Insert those rows into the tags table.  This is where we'll
            # get any unique constraint violations.
            self._db.insert(self._tags, *tagsRows)

        for dataId, datasetId in zip(dataIdList, datasetIdList):
            yield DatasetRef(
                datasetType=self.datasetType,
                dataId=dataId,
                id=datasetId,
                run=run.name,
            )
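

# ``_insert`` above merges a prototype tags row (dataset type ID plus the run
# key) with each generated dataset_id and the corresponding data ID values via
# ``dict(protoTagsRow, dataset_id=..., **dataId.byName())``.  The hypothetical
# helper below sketches that merge with plain dictionaries standing in for
# data IDs; it is illustrative only and not part of the Registry API.
def _sketch_tags_rows(
    protoTagsRow: Dict[str, Any],
    datasetIds: List[int],
    dataIdValues: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Build one tags-table row per (dataset ID, data ID) pair."""
    return [
        dict(protoTagsRow, dataset_id=dataset_id, **values)
        for dataset_id, values in zip(datasetIds, dataIdValues)
    ]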


class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
    """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for
    dataset IDs.
    """

    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    """Namespace UUID used for UUID5 generation. Do not change. This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # Remember any governor dimension values we see.
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)

        # Iterate over data IDs, transforming a possibly-single-pass iterable
        # into a list.
        dataIdList = []
        rows = []
        for dataId in dataIds:
            dataIdList.append(dataId)
            rows.append(
                {
                    "id": self._makeDatasetId(run, dataId, idMode),
                    "dataset_type_id": self._dataset_type_id,
                    self._runKeyColumn: run.key,
                }
            )
            governorValues.update_extract(dataId)

        with self._db.transaction():
            # Insert into the static dataset table.
            self._db.insert(self._static.dataset, *rows)
            # Update the summary tables for this collection in case this is the
            # first time this dataset type or these governor values will be
            # inserted there.
            self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
            # Combine the generated dataset_id values and data ID fields to
            # form rows to be inserted into the tags table.
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): run.key,
            }
            tagsRows = [
                dict(protoTagsRow, dataset_id=row["id"], **dataId.byName())
                for dataId, row in zip(dataIdList, rows)
            ]
            # Insert those rows into the tags table.
            self._db.insert(self._tags, *tagsRows)

        for dataId, row in zip(dataIdList, rows):
            yield DatasetRef(
                datasetType=self.datasetType,
                dataId=dataId,
                id=row["id"],
                run=run.name,
            )

    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # Remember any governor dimension values we see.
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)

        # Iterate over data IDs, transforming a possibly-single-pass iterable
        # into a list.
        dataIds = {}
        for dataset in datasets:
            # Ignore unknown ID types; normally all IDs have the same type,
            # but this code supports mixed types or missing IDs.
            datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None
            if datasetId is None:
                datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode)
            dataIds[datasetId] = dataset.dataId
            governorValues.update_extract(dataset.dataId)

        with self._db.session() as session:

            # Insert all new rows into a temporary table.
            tableSpec = makeTagTableSpec(
                self.datasetType, type(self._collections), ddl.GUID, constraints=False
            )
            tmp_tags = session.makeTemporaryTable(tableSpec)

            collFkName = self._collections.getCollectionForeignKeyName()
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                collFkName: run.key,
            }
            tmpRows = [
                dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
                for dataset_id, dataId in dataIds.items()
            ]

            with self._db.transaction():

                # Store all incoming data in the temporary table.
                self._db.insert(tmp_tags, *tmpRows)

                # There are some checks that we want to make for consistency
                # of the new datasets with existing ones.
                self._validateImport(tmp_tags, run)

                # Before we merge the temporary table into dataset/tags we
                # need to drop datasets which are already there (and do not
                # conflict).
                self._db.deleteWhere(
                    tmp_tags,
                    tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)),
                )

                # Copy it into the dataset table; need to re-label some
                # columns.
                self._db.insert(
                    self._static.dataset,
                    select=sqlalchemy.sql.select(
                        tmp_tags.columns.dataset_id.label("id"),
                        tmp_tags.columns.dataset_type_id,
                        tmp_tags.columns[collFkName].label(self._runKeyColumn),
                    ),
                )

                # Update the summary tables for this collection in case this
                # is the first time this dataset type or these governor values
                # will be inserted there.
                self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)

                # Copy it into the tags table.
                self._db.insert(self._tags, select=tmp_tags.select())

        # Return refs in the same order as in the input list.
        for dataset_id, dataId in dataIds.items():
            yield DatasetRef(
                datasetType=self.datasetType,
                id=dataset_id,
                dataId=dataId,
                run=run.name,
            )

    def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None:
        """Validate imported refs against existing datasets.

        Parameters
        ----------
        tmp_tags : `sqlalchemy.schema.Table`
            Temporary table with new datasets and the same schema as the tags
            table.
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection.

        Raises
        ------
        ConflictingDefinitionError
            Raised if new datasets conflict with existing ones.
        """
        dataset = self._static.dataset
        tags = self._tags
        collFkName = self._collections.getCollectionForeignKeyName()

        # Check that existing datasets have the same dataset type and run.
        query = (
            sqlalchemy.sql.select(
                dataset.columns.id.label("dataset_id"),
                dataset.columns.dataset_type_id.label("dataset_type_id"),
                tmp_tags.columns.dataset_type_id.label("new dataset_type_id"),
                dataset.columns[self._runKeyColumn].label("run"),
                tmp_tags.columns[collFkName].label("new run"),
            )
            .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id))
            .where(
                sqlalchemy.sql.or_(
                    dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
                    dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName],
                )
            )
        )
        result = self._db.query(query)
        if (row := result.first()) is not None:
            # Only include the first one in the exception message.
            raise ConflictingDefinitionError(
                f"Existing dataset type or run do not match new dataset: {row._asdict()}"
            )

        # Check that the matching dataset in the tags table has the same
        # DataId.
        query = (
            sqlalchemy.sql.select(
                tags.columns.dataset_id,
                tags.columns.dataset_type_id.label("type_id"),
                tmp_tags.columns.dataset_type_id.label("new type_id"),
                *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
                *[
                    tmp_tags.columns[dim].label(f"new {dim}")
                    for dim in self.datasetType.dimensions.required.names
                ],
            )
            .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id))
            .where(
                sqlalchemy.sql.or_(
                    tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id,
                    *[
                        tags.columns[dim] != tmp_tags.columns[dim]
                        for dim in self.datasetType.dimensions.required.names
                    ],
                )
            )
        )
        result = self._db.query(query)
        if (row := result.first()) is not None:
            # Only include the first one in the exception message.
            raise ConflictingDefinitionError(
                f"Existing dataset type or dataId do not match new dataset: {row._asdict()}"
            )

        # Check that matching run+dataId have the same dataset ID.
        query = (
            sqlalchemy.sql.select(
                tags.columns.dataset_type_id.label("dataset_type_id"),
                *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names],
                tags.columns.dataset_id,
                tmp_tags.columns.dataset_id.label("new dataset_id"),
                tags.columns[collFkName],
                tmp_tags.columns[collFkName].label(f"new {collFkName}"),
            )
            .select_from(
                tags.join(
                    tmp_tags,
                    sqlalchemy.sql.and_(
                        tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id,
                        tags.columns[collFkName] == tmp_tags.columns[collFkName],
                        *[
                            tags.columns[dim] == tmp_tags.columns[dim]
                            for dim in self.datasetType.dimensions.required.names
                        ],
                    ),
                )
            )
            .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id)
        )
        result = self._db.query(query)
        if (row := result.first()) is not None:
            # Only include the first one in the exception message.
            raise ConflictingDefinitionError(
                f"Existing dataset type and dataId does not match new dataset: {row._asdict()}"
            )

    def _makeDatasetId(
        self, run: RunRecord, dataId: DataCoordinate, idGenerationMode: DatasetIdGenEnum
    ) -> uuid.UUID:
        """Generate dataset ID for a dataset.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the RUN collection for the dataset.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
        """
        if idGenerationMode is DatasetIdGenEnum.UNIQUE:
            return uuid.uuid4()
        else:
            # WARNING: If you modify this code make sure that the order of
            # items in the `items` list below never changes.
            items: List[Tuple[str, str]] = []
            if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
                items = [
                    ("dataset_type", self.datasetType.name),
                ]
            elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:  # coverage: condition never false
                items = [
                    ("dataset_type", self.datasetType.name),
                    ("run", run.name),
                ]
            else:
                raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

            for name, value in sorted(dataId.byName().items()):
                items.append((name, str(value)))
            data = ",".join(f"{key}={value}" for key, value in items)
            return uuid.uuid5(self.NS_UUID, data)
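

# ``_makeDatasetId`` above derives deterministic IDs by hashing a
# comma-separated ``key=value`` string with ``uuid.uuid5`` under ``NS_UUID``,
# so the same dataset type, run, and data ID always map to the same UUID.
# The hypothetical helper below reproduces the DATAID_TYPE_RUN recipe with
# plain values; it is illustrative only and not part of the Registry API.
def _sketch_deterministic_dataset_id(
    dataset_type_name: str, run_name: str, data_id_values: Dict[str, Any]
) -> uuid.UUID:
    """Re-create the DATAID_TYPE_RUN ID recipe used by ``_makeDatasetId``."""
    items: List[Tuple[str, str]] = [
        ("dataset_type", dataset_type_name),
        ("run", run_name),
    ]
    for name, value in sorted(data_id_values.items()):
        items.append((name, str(value)))
    data = ",".join(f"{key}={value}" for key, value in items)
    return uuid.uuid5(ByDimensionsDatasetRecordStorageUUID.NS_UUID, data)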