Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py : 80%

from __future__ import annotations

__all__ = ("ByDimensionsDatasetRecordStorage",)

from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Set,
    Tuple,
    TYPE_CHECKING,
)
import uuid

import sqlalchemy

from lsst.daf.butler import (
    CollectionType,
    DataCoordinate,
    DataCoordinateSet,
    DatasetId,
    DatasetRef,
    DatasetType,
    SimpleQuery,
    Timespan,
)
from lsst.daf.butler.registry import ConflictingDefinitionError, UnsupportedIdGeneratorError
from lsst.daf.butler.registry.interfaces import DatasetRecordStorage, DatasetIdGenEnum

from ...summaries import GovernorDimensionRestriction

if TYPE_CHECKING:  # coverage: never true
    from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
    from .tables import StaticDatasetTablesTuple
    from .summaries import CollectionSummaryManager


class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
    """Dataset record storage implementation paired with
    `ByDimensionsDatasetRecordStorageManager`; see that class for more
    information.

    Instances of this class should never be constructed directly; use
    `DatasetRecordStorageManager.register` instead.
    """

    def __init__(self, *, datasetType: DatasetType,
                 db: Database,
                 dataset_type_id: int,
                 collections: CollectionManager,
                 static: StaticDatasetTablesTuple,
                 summaries: CollectionSummaryManager,
                 tags: sqlalchemy.schema.Table,
                 calibs: Optional[sqlalchemy.schema.Table]):
        super().__init__(datasetType=datasetType)
        self._dataset_type_id = dataset_type_id
        self._db = db
        self._collections = collections
        self._static = static
        self._summaries = summaries
        self._tags = tags
        self._calibs = calibs
        self._runKeyColumn = collections.getRunForeignKeyName()

    def find(self, collection: CollectionRecord, dataId: DataCoordinate,
             timespan: Optional[Timespan] = None) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.
        assert dataId.graph == self.datasetType.dimensions
        if collection.type is CollectionType.CALIBRATION and timespan is None:  # coverage: never true
            raise TypeError(f"Cannot search for dataset in CALIBRATION collection {collection.name} "
                            f"without an input timespan.")
        sql = self.select(collection=collection, dataId=dataId, id=SimpleQuery.Select,
                          run=SimpleQuery.Select, timespan=timespan)
        if sql is None:  # coverage: never true
            return None
        else:
            sql = sql.combine()
        results = self._db.query(sql)
        row = results.fetchone()
        if row is None:
            return None
        if collection.type is CollectionType.CALIBRATION:
            # For temporal calibration lookups (only!) our invariants do not
            # guarantee that the number of result rows is <= 1.
            # They would if `select` constrained the given timespan to be
            # _contained_ by the validity range in the self._calibs table,
            # instead of simply _overlapping_ it, because we do guarantee that
            # the validity ranges are disjoint for a particular dataset type,
            # collection, and data ID.  But using an overlap test and a check
            # for multiple result rows here allows us to provide a more useful
            # diagnostic, as well as allowing `select` to support more general
            # queries where multiple results are not an error.
            if results.fetchone() is not None:
                raise RuntimeError(
                    f"Multiple matches found for calibration lookup in {collection.name} for "
                    f"{self.datasetType.name} with {dataId} overlapping {timespan}. "
                )
        return DatasetRef(
            datasetType=self.datasetType,
            dataId=dataId,
            id=row["id"],
            run=self._collections[row[self._runKeyColumn]].name
        )
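
    # Illustrative usage sketch, not part of the original source (``collections``,
    # ``storage``, and the collection name are hypothetical placeholders):
    #
    #   record = collections.find("HSC/calib")   # a CALIBRATION collection record
    #   ref = storage.find(record, dataId, timespan=Timespan(begin, end))
    #   if ref is None:
    #       ...  # nothing certified for this data ID overlapping the timespan
    #
    # For CALIBRATION collections the timespan is required (see the TypeError
    # above); for RUN/TAGGED collections the data ID alone identifies the dataset.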

    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        # Only delete from common dataset table; ON DELETE foreign key clauses
        # will handle the rest.
        self._db.delete(
            self._static.dataset,
            ["id"],
            *[{"id": dataset.getCheckedId()} for dataset in datasets],
        )

    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:  # coverage: never true
            raise TypeError(f"Cannot associate into collection '{collection.name}' "
                            f"of type {collection.type.name}; must be TAGGED.")
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            governorValues.update_extract(dataset.dataId)
            rows.append(row)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values will be
        # inserted there.
        self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
        # Update the tag table itself.
        self._db.replace(self._tags, *rows)

    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:  # coverage: never true
            raise TypeError(f"Cannot disassociate from collection '{collection.name}' "
                            f"of type {collection.type.name}; must be TAGGED.")
        rows = [
            {
                "dataset_id": dataset.getCheckedId(),
                self._collections.getCollectionForeignKeyName(): collection.key
            }
            for dataset in datasets
        ]
        self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()],
                        *rows)

    def _buildCalibOverlapQuery(self, collection: CollectionRecord,
                                dataIds: Optional[DataCoordinateSet],
                                timespan: Timespan) -> SimpleQuery:
        assert self._calibs is not None
        # Start by building a SELECT query for any rows that would overlap
        # this one.
        query = SimpleQuery()
        query.join(self._calibs)
        # Add a WHERE clause matching the dataset type and collection.
        query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id)
        query.where.append(
            self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key
        )
        # Add a WHERE clause matching any of the given data IDs.
        if dataIds is not None:
            dataIds.constrain(
                query,
                lambda name: self._calibs.columns[name],  # type: ignore
            )
        # Add WHERE clause for timespan overlaps.
        TimespanReprClass = self._db.getTimespanRepresentation()
        query.where.append(
            TimespanReprClass.fromSelectable(self._calibs).overlaps(TimespanReprClass.fromLiteral(timespan))
        )
        return query

    def certify(self, collection: CollectionRecord, datasets: Iterable[DatasetRef],
                timespan: Timespan) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:  # coverage: never true
            raise TypeError(f"Cannot certify datasets of type {self.datasetType.name}, for which "
                            f"DatasetType.isCalibration() is False.")
        if collection.type is not CollectionType.CALIBRATION:  # coverage: never true
            raise TypeError(f"Cannot certify into collection '{collection.name}' "
                            f"of type {collection.type.name}; must be CALIBRATION.")
        TimespanReprClass = self._db.getTimespanRepresentation()
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        dataIds: Optional[Set[DataCoordinate]] = (
            set() if not TimespanReprClass.hasExclusionConstraint() else None
        )
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            TimespanReprClass.update(timespan, result=row)
            governorValues.update_extract(dataset.dataId)
            rows.append(row)
            if dataIds is not None:  # coverage: never false
                dataIds.add(dataset.dataId)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values will be
        # inserted there.
        self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
        # Update the association table itself.
        if TimespanReprClass.hasExclusionConstraint():  # coverage: never true
            # Rely on the database constraint to enforce invariants; we just
            # reraise the exception for consistency across DB engines.
            try:
                self._db.insert(self._calibs, *rows)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Validity range conflict certifying datasets of type {self.datasetType.name} "
                    f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
                ) from err
        else:
            # Have to implement the exclusion constraint ourselves.
            # Start by building a SELECT query for any rows that would overlap
            # this one.
            query = self._buildCalibOverlapQuery(
                collection,
                DataCoordinateSet(dataIds, graph=self.datasetType.dimensions),  # type: ignore
                timespan
            )
            query.columns.append(sqlalchemy.sql.func.count())
            sql = query.combine()
            # Acquire a table lock to ensure there are no concurrent writes
            # that could invalidate our checking before we finish the inserts.
            # We use a SAVEPOINT in case there is an outer transaction that a
            # failure here should not roll back.
            with self._db.transaction(lock=[self._calibs], savepoint=True):
                # Run the check SELECT query.
                conflicting = self._db.query(sql).scalar()
                if conflicting > 0:
                    raise ConflictingDefinitionError(
                        f"{conflicting} validity range conflicts certifying datasets of type "
                        f"{self.datasetType.name} into {collection.name} for range "
                        f"[{timespan.begin}, {timespan.end})."
                    )
                # Proceed with the insert.
                self._db.insert(self._calibs, *rows)

    def decertify(self, collection: CollectionRecord, timespan: Timespan, *,
                  dataIds: Optional[Iterable[DataCoordinate]] = None) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:  # coverage: never true
            raise TypeError(f"Cannot decertify datasets of type {self.datasetType.name}, for which "
                            f"DatasetType.isCalibration() is False.")
        if collection.type is not CollectionType.CALIBRATION:  # coverage: never true
            raise TypeError(f"Cannot decertify from collection '{collection.name}' "
                            f"of type {collection.type.name}; must be CALIBRATION.")
        TimespanReprClass = self._db.getTimespanRepresentation()
        # Construct a SELECT query to find all rows that overlap our inputs.
        dataIdSet: Optional[DataCoordinateSet]
        if dataIds is not None:
            dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions)
        else:
            dataIdSet = None
        query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan)
        query.columns.extend(self._calibs.columns)
        sql = query.combine()
        # Set up collections to populate with the rows we'll want to modify.
        # The insert rows will have the same values for collection and
        # dataset type.
        protoInsertRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rowsToDelete = []
        rowsToInsert = []
        # Acquire a table lock to ensure there are no concurrent writes
        # between the SELECT and the DELETE and INSERT queries based on it.
        with self._db.transaction(lock=[self._calibs], savepoint=True):
            for row in self._db.query(sql):
                rowsToDelete.append({"id": row["id"]})
                # Construct the insert row(s) by copying the prototype row,
                # then adding the dimension column values, then adding what's
                # left of the timespan from that row after we subtract the
                # given timespan.
                newInsertRow = protoInsertRow.copy()
                newInsertRow["dataset_id"] = row["dataset_id"]
                for name in self.datasetType.dimensions.required.names:
                    newInsertRow[name] = row[name]
                rowTimespan = TimespanReprClass.extract(row)
                assert rowTimespan is not None, "Field should have a NOT NULL constraint."
                for diffTimespan in rowTimespan.difference(timespan):
                    rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()))
            # Run the DELETE and INSERT queries.
            self._db.delete(self._calibs, ["id"], *rowsToDelete)
            self._db.insert(self._calibs, *rowsToInsert)
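
    # Illustrative note, not part of the original source: ``Timespan.difference`` in
    # the loop above is what turns a partial decertification into a split validity
    # range.  For example, given an existing row certified over
    # [2020-01-01, 2021-01-01), decertifying [2020-06-01, 2020-07-01) deletes that
    # row and re-inserts two rows covering [2020-01-01, 2020-06-01) and
    # [2020-07-01, 2021-01-01); the dates here are hypothetical.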

    def select(self, collection: CollectionRecord,
               dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
               id: SimpleQuery.Select.Or[Optional[int]] = SimpleQuery.Select,
               run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
               timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
               ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
               ) -> Optional[SimpleQuery]:
        # Docstring inherited from DatasetRecordStorage.
        assert collection.type is not CollectionType.CHAINED
        #
        # There are two tables in play here:
        #
        #  - the static dataset table (with the dataset ID, dataset type ID,
        #    run ID/name, and ingest date);
        #
        #  - the dynamic tags/calibs table (with the dataset ID, dataset
        #    type ID, collection ID/name, data ID, and possibly validity
        #    range).
        #
        # That means that we might want to return a query against either table
        # or a JOIN of both, depending on which quantities the caller wants.
        # But this method is documented/typed such that ``dataId`` is never
        # `None` - i.e. we always constrain or retrieve the data ID.  That
        # means we'll always include the tags/calibs table and join in the
        # static dataset table only if we need things from it that we can't
        # get from the tags/calibs table.
        #
        # Note that it's important that we include a WHERE constraint on both
        # tables for any column (e.g. dataset_type_id) that is in both when
        # it's given explicitly; not doing so can prevent the query planner
        # from using very important indexes.  At present, we don't include
        # those redundant columns in the JOIN ON expression, however, because
        # the FOREIGN KEY (and its index) are defined only on dataset_id.
        #
        # We'll start with an empty SimpleQuery, and accumulate kwargs to pass
        # to its `join` method when we bring in the tags/calibs table.
        query = SimpleQuery()
        # We get the data ID or constrain it in the tags/calibs table, but
        # that's multiple columns, not one, so we need to transform the one
        # Select.Or argument into a dictionary of them.
        kwargs: Dict[str, Any]
        if dataId is SimpleQuery.Select:
            kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required}
        else:
            kwargs = dict(dataId.byName())
        # We always constrain (never retrieve) the collection in the
        # tags/calibs table.
        kwargs[self._collections.getCollectionForeignKeyName()] = collection.key
        # We always constrain (never retrieve) the dataset type in at least
        # the tags/calibs table.
        kwargs["dataset_type_id"] = self._dataset_type_id
        # Join in the tags or calibs table, turning those 'kwargs' entries
        # into WHERE constraints or SELECT columns as appropriate.
        if collection.type is CollectionType.CALIBRATION:
            assert self._calibs is not None, \
                "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
            TimespanReprClass = self._db.getTimespanRepresentation()
            # Add the timespan column(s) to the result columns, or constrain
            # the timespan via an overlap condition.
            if timespan is SimpleQuery.Select:
                kwargs.update({k: SimpleQuery.Select for k in TimespanReprClass.getFieldNames()})
            elif timespan is not None:  # coverage: never false
                query.where.append(
                    TimespanReprClass.fromSelectable(self._calibs).overlaps(
                        TimespanReprClass.fromLiteral(timespan)
                    )
                )
            query.join(self._calibs, **kwargs)
            dataset_id_col = self._calibs.columns.dataset_id
        else:
            query.join(self._tags, **kwargs)
            dataset_id_col = self._tags.columns.dataset_id
        # We can always get the dataset_id from the tags/calibs table or
        # constrain it there.  Can't use kwargs for that because we need to
        # alias it to 'id'.
        if id is SimpleQuery.Select:
            query.columns.append(dataset_id_col.label("id"))
        elif id is not None:  # coverage: never true
            query.where.append(dataset_id_col == id)
        # It's possible we now have everything we need, from just the
        # tags/calibs table.  The things we might need to get from the static
        # dataset table are the run key and the ingest date.
        need_static_table = False
        static_kwargs = {}
        if run is not None:
            if collection.type is CollectionType.RUN:
                if run is SimpleQuery.Select:  # coverage: never false
                    # If the collection we're searching is a RUN, we know that
                    # if we find the dataset in that collection, then that's
                    # the dataset's run; we don't need to query for it.
                    query.columns.append(sqlalchemy.sql.literal(collection.key).label(self._runKeyColumn))
                elif run != collection.name:
                    # This [sub]query is doomed to yield no results; a dataset
                    # cannot be in more than one run.
                    return None
                else:
                    query.where.append(self._static.dataset.columns[self._runKeyColumn] == collection.key)
            else:
                static_kwargs[self._runKeyColumn] = (
                    SimpleQuery.Select if run is SimpleQuery.Select else self._collections.find(run).key
                )
                need_static_table = True
        # Ingest date can only come from the static table.
        if ingestDate is not None:
            need_static_table = True
            if ingestDate is SimpleQuery.Select:  # coverage: never false
                static_kwargs["ingest_date"] = SimpleQuery.Select
            else:
                assert isinstance(ingestDate, Timespan)
                # Timespan is astropy Time (usually in TAI) and ingest_date is
                # TIMESTAMP, so convert the values to Python datetime for
                # sqlalchemy.
                if ingestDate.isEmpty():
                    raise RuntimeError("Empty timespan constraint provided for ingest_date.")
                if ingestDate.begin is not None:
                    begin = ingestDate.begin.utc.datetime  # type: ignore
                    query.where.append(self._static.dataset.columns.ingest_date >= begin)
                if ingestDate.end is not None:
                    end = ingestDate.end.utc.datetime  # type: ignore
                    query.where.append(self._static.dataset.columns.ingest_date < end)
        # If we need the static table, join it in via dataset_id and
        # dataset_type_id.
        if need_static_table:
            query.join(
                self._static.dataset,
                onclause=(dataset_id_col == self._static.dataset.columns.id),
                **static_kwargs,
            )
            # Also constrain dataset_type_id in the static table in case that
            # helps generate a better plan.
            # We could also include this in the JOIN ON clause, but my guess is
            # that that's a good idea IFF it's in the foreign key, and right
            # now it isn't.
            query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id)
        return query

    def getDataId(self, id: DatasetId) -> DataCoordinate:
        """Return DataId for a dataset.

        Parameters
        ----------
        id : `DatasetId`
            Unique dataset identifier.

        Returns
        -------
        dataId : `DataCoordinate`
            DataId for the dataset.
        """
        # This query could return multiple rows (one for each tagged collection
        # the dataset is in, plus one for its run collection), and we don't
        # care which of those we get.
        sql = self._tags.select().where(
            sqlalchemy.sql.and_(
                self._tags.columns.dataset_id == id,
                self._tags.columns.dataset_type_id == self._dataset_type_id
            )
        ).limit(1)
        row = self._db.query(sql).fetchone()
        assert row is not None, "Should be guaranteed by caller and foreign key constraints."
        return DataCoordinate.standardize(
            {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
            graph=self.datasetType.dimensions
        )


class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage):
    """Implementation of ByDimensionsDatasetRecordStorage which uses an
    auto-incremented integer column for dataset IDs.
    """

    def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate],
               idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # We only support UNIQUE mode for integer dataset IDs
        if idMode != DatasetIdGenEnum.UNIQUE:  # coverage: never true
            raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.")

        # Transform a possibly-single-pass iterable into a list.
        dataIdList = list(dataIds)
        yield from self._insert(run, dataIdList)

    def import_(self, run: RunRecord, datasets: Iterable[DatasetRef],
                idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
                reuseIds: bool = False) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # We only support UNIQUE mode for integer dataset IDs
        if idGenerationMode != DatasetIdGenEnum.UNIQUE:  # coverage: never true
            raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.")

        # Make a list of dataIds and optionally dataset IDs.
        dataIdList: List[DataCoordinate] = []
        datasetIdList: List[int] = []
        for dataset in datasets:
            dataIdList.append(dataset.dataId)

            # We only accept integer dataset IDs, but also allow None.
            datasetId = dataset.id
            if datasetId is None:  # coverage: never true
                # if reuseIds is set then all IDs must be known
                if reuseIds:
                    raise TypeError("All dataset IDs must be known if `reuseIds` is set")
            elif isinstance(datasetId, int):  # coverage: never false
                if reuseIds:
                    datasetIdList.append(datasetId)
            else:
                raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}")

        yield from self._insert(run, dataIdList, datasetIdList)

    def _insert(self, run: RunRecord, dataIdList: List[DataCoordinate],
                datasetIdList: Optional[List[int]] = None) -> Iterator[DatasetRef]:
        """Common part of implementation of `insert` and `import_` methods.
        """

        # Remember any governor dimension values we see.
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataId in dataIdList:
            governorValues.update_extract(dataId)

        staticRow = {
            "dataset_type_id": self._dataset_type_id,
            self._runKeyColumn: run.key,
        }
        with self._db.transaction():
            # Insert into the static dataset table, generating autoincrement
            # dataset_id values.
            if datasetIdList:
                # reuse existing IDs
                rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList]
                self._db.insert(self._static.dataset, *rows)
            else:
                # use auto-incremented IDs
                datasetIdList = self._db.insert(self._static.dataset, *([staticRow]*len(dataIdList)),
                                                returnIds=True)
                assert datasetIdList is not None
            # Update the summary tables for this collection in case this is the
            # first time this dataset type or these governor values will be
            # inserted there.
            self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
            # Combine the generated dataset_id values and data ID fields to
            # form rows to be inserted into the tags table.
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): run.key,
            }
            tagsRows = [
                dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
                for dataId, dataset_id in zip(dataIdList, datasetIdList)
            ]
            # Insert those rows into the tags table.  This is where we'll
            # get any unique constraint violations.
            self._db.insert(self._tags, *tagsRows)

        for dataId, datasetId in zip(dataIdList, datasetIdList):
            yield DatasetRef(
                datasetType=self.datasetType,
                dataId=dataId,
                id=datasetId,
                run=run.name,
            )


class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
    """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for
    dataset IDs.
    """

    NS_UUID = uuid.UUID('840b31d9-05cd-5161-b2c8-00d32b280d0f')
    """Namespace UUID used for UUID5 generation.  Do not change.  This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate],
               idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # Iterate over data IDs, transforming a possibly-single-pass iterable
        # into a list.
        dataIdList = []
        rows = []
        for dataId in dataIds:
            dataIdList.append(dataId)
            rows.append({
                "id": self._makeDatasetId(run, dataId, idMode),
                "dataset_type_id": self._dataset_type_id,
                self._runKeyColumn: run.key,
            })

        yield from self._insert(run, dataIdList, rows, self._db.insert)

    def import_(self, run: RunRecord, datasets: Iterable[DatasetRef],
                idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
                reuseIds: bool = False) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # Iterate over data IDs, transforming a possibly-single-pass iterable
        # into a list.
        dataIdList = []
        rows = []
        for dataset in datasets:
            dataIdList.append(dataset.dataId)
            # Ignore unknown ID types; normally all IDs have the same type,
            # but this code supports mixed types or missing IDs.
            datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None
            if datasetId is None:
                datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode)
            rows.append({
                "id": datasetId,
                "dataset_type_id": self._dataset_type_id,
                self._runKeyColumn: run.key,
            })

        yield from self._insert(run, dataIdList, rows, self._db.ensure)

    def _insert(self, run: RunRecord, dataIdList: List[DataCoordinate],
                rows: List[Dict], insertMethod: Callable) -> Iterator[DatasetRef]:
        """Common part of implementation of `insert` and `import_` methods.
        """

        # Remember any governor dimension values we see.
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataId in dataIdList:
            governorValues.update_extract(dataId)

        with self._db.transaction():
            # Insert into the static dataset table.
            insertMethod(self._static.dataset, *rows)
            # Update the summary tables for this collection in case this is the
            # first time this dataset type or these governor values will be
            # inserted there.
            self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
            # Combine the generated dataset_id values and data ID fields to
            # form rows to be inserted into the tags table.
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): run.key,
            }
            tagsRows = [
                dict(protoTagsRow, dataset_id=row["id"], **dataId.byName())
                for dataId, row in zip(dataIdList, rows)
            ]
            # Insert those rows into the tags table.
            insertMethod(self._tags, *tagsRows)
        for dataId, row in zip(dataIdList, rows):
            yield DatasetRef(
                datasetType=self.datasetType,
                dataId=dataId,
                id=row["id"],
                run=run.name,
            )

    def _makeDatasetId(self, run: RunRecord, dataId: DataCoordinate,
                       idGenerationMode: DatasetIdGenEnum) -> uuid.UUID:
        """Generate dataset ID for a dataset.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the RUN collection for the dataset.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option.  `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID.  `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``.  `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
        """
        if idGenerationMode is DatasetIdGenEnum.UNIQUE:  # coverage: never false
            return uuid.uuid4()
        else:
            # WARNING: If you modify this code make sure that the order of
            # items in the `items` list below never changes.
            items: List[Tuple[str, str]] = []
            if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
                items = [
                    ("dataset_type", self.datasetType.name),
                ]
            elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
                items = [
                    ("dataset_type", self.datasetType.name),
                    ("run", run.name),
                ]
            else:
                raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

            for name, value in sorted(dataId.byName().items()):
                items.append((name, str(value)))
            data = ",".join(f"{key}={value}" for key, value in items)
            return uuid.uuid5(self.NS_UUID, data)
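

# Illustrative sketch, not part of the original source: for
# ``DatasetIdGenEnum.DATAID_TYPE_RUN`` the payload hashed by ``_makeDatasetId`` is the
# comma-separated ``key=value`` string built above.  With a hypothetical dataset type
# ``flat``, run ``HSC/calib/run1``, and data ID ``{instrument: "HSC", detector: 42}``,
# that payload is
#
#   "dataset_type=flat,run=HSC/calib/run1,detector=42,instrument=HSC"
#
# and the dataset ID is ``uuid.uuid5(ByDimensionsDatasetRecordStorageUUID.NS_UUID, payload)``,
# so importing the same dataset into the same run always reproduces the same UUID.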