Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 85%
from __future__ import annotations

__all__ = ("ByDimensionsDatasetRecordStorage",)

from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Set,
    Tuple,
    TYPE_CHECKING,
)
import uuid

import sqlalchemy

from lsst.daf.butler import (
    CollectionType,
    DataCoordinate,
    DataCoordinateSet,
    DatasetId,
    DatasetRef,
    DatasetType,
    SimpleQuery,
    Timespan,
)
from lsst.daf.butler.registry import ConflictingDefinitionError, UnsupportedIdGeneratorError
from lsst.daf.butler.registry.interfaces import DatasetRecordStorage, DatasetIdGenEnum

from ...summaries import GovernorDimensionRestriction

if TYPE_CHECKING:
    from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
    from .tables import StaticDatasetTablesTuple
    from .summaries import CollectionSummaryManager


class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
    """Dataset record storage implementation paired with
    `ByDimensionsDatasetRecordStorageManager`; see that class for more
    information.

    Instances of this class should never be constructed directly; use
    `DatasetRecordStorageManager.register` instead.
    """

    def __init__(self, *, datasetType: DatasetType,
                 db: Database,
                 dataset_type_id: int,
                 collections: CollectionManager,
                 static: StaticDatasetTablesTuple,
                 summaries: CollectionSummaryManager,
                 tags: sqlalchemy.schema.Table,
                 calibs: Optional[sqlalchemy.schema.Table]):
        super().__init__(datasetType=datasetType)
        self._dataset_type_id = dataset_type_id
        self._db = db
        self._collections = collections
        self._static = static
        self._summaries = summaries
        self._tags = tags
        self._calibs = calibs
        self._runKeyColumn = collections.getRunForeignKeyName()

    def find(self, collection: CollectionRecord, dataId: DataCoordinate,
             timespan: Optional[Timespan] = None) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.
        assert dataId.graph == self.datasetType.dimensions
        if collection.type is CollectionType.CALIBRATION and timespan is None:
            raise TypeError(f"Cannot search for dataset in CALIBRATION collection {collection.name} "
                            f"without an input timespan.")
        sql = self.select(collection, dataId=dataId, id=SimpleQuery.Select,
                          run=SimpleQuery.Select, timespan=timespan)
        sql = sql.combine()
        results = self._db.query(sql)
        row = results.fetchone()
        if row is None:
            return None
        if collection.type is CollectionType.CALIBRATION:
            # For temporal calibration lookups (only!) our invariants do not
            # guarantee that the number of result rows is <= 1.
            # They would if `select` constrained the given timespan to be
            # _contained_ by the validity range in the self._calibs table,
            # instead of simply _overlapping_ it, because we do guarantee that
            # the validity ranges are disjoint for a particular dataset type,
            # collection, and data ID.  But using an overlap test and a check
            # for multiple result rows here allows us to provide a more useful
            # diagnostic, as well as allowing `select` to support more general
            # queries where multiple results are not an error.
            if results.fetchone() is not None:
                raise RuntimeError(
                    f"Multiple matches found for calibration lookup in {collection.name} for "
                    f"{self.datasetType.name} with {dataId} overlapping {timespan}. "
                )
        return DatasetRef(
            datasetType=self.datasetType,
            dataId=dataId,
            id=row.id,
            run=self._collections[row._mapping[self._runKeyColumn]].name
        )
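    # Illustrative usage sketch (``storage`` is an instance of this class;
    # ``calib_collection`` and ``data_id`` are hypothetical placeholders for
    # a CALIBRATION CollectionRecord and a matching DataCoordinate):
    #
    #     ref = storage.find(calib_collection, data_id,
    #                        timespan=Timespan(begin, end))
    #
    # A timespan is required for CALIBRATION collections; for RUN or TAGGED
    # collections it may be omitted.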

    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        # Only delete from common dataset table; ON DELETE foreign key clauses
        # will handle the rest.
        self._db.delete(
            self._static.dataset,
            ["id"],
            *[{"id": dataset.getCheckedId()} for dataset in datasets],
        )

    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:
            raise TypeError(f"Cannot associate into collection '{collection.name}' "
                            f"of type {collection.type.name}; must be TAGGED.")
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            governorValues.update_extract(dataset.dataId)
            rows.append(row)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values will be
        # inserted there.
        self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
        # Update the tag table itself.
        self._db.replace(self._tags, *rows)

    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:
            raise TypeError(f"Cannot disassociate from collection '{collection.name}' "
                            f"of type {collection.type.name}; must be TAGGED.")
        rows = [
            {
                "dataset_id": dataset.getCheckedId(),
                self._collections.getCollectionForeignKeyName(): collection.key
            }
            for dataset in datasets
        ]
        self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()],
                        *rows)

    def _buildCalibOverlapQuery(self, collection: CollectionRecord,
                                dataIds: Optional[DataCoordinateSet],
                                timespan: Timespan) -> SimpleQuery:
        assert self._calibs is not None
        # Start by building a SELECT query for any rows that would overlap
        # this one.
        query = SimpleQuery()
        query.join(self._calibs)
        # Add a WHERE clause matching the dataset type and collection.
        query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id)
        query.where.append(
            self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key
        )
        # Add a WHERE clause matching any of the given data IDs.
        if dataIds is not None:
            dataIds.constrain(
                query,
                lambda name: self._calibs.columns[name],  # type: ignore
            )
        # Add WHERE clause for timespan overlaps.
        TimespanReprClass = self._db.getTimespanRepresentation()
        query.where.append(
            TimespanReprClass.fromSelectable(self._calibs).overlaps(TimespanReprClass.fromLiteral(timespan))
        )
        return query
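    # The query built above corresponds roughly to SQL of the form (sketch;
    # actual table and column names come from the dynamic calibs table and
    # the collection manager):
    #
    #     SELECT ... FROM <calibs>
    #     WHERE dataset_type_id = :dataset_type_id
    #       AND <collection_fk> = :collection_key
    #       AND <data ID columns match>              -- only if dataIds given
    #       AND <validity range overlaps :timespan>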

    def certify(self, collection: CollectionRecord, datasets: Iterable[DatasetRef],
                timespan: Timespan) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:
            raise TypeError(f"Cannot certify datasets of type {self.datasetType.name}, for which "
                            f"DatasetType.isCalibration() is False.")
        if collection.type is not CollectionType.CALIBRATION:
            raise TypeError(f"Cannot certify into collection '{collection.name}' "
                            f"of type {collection.type.name}; must be CALIBRATION.")
        TimespanReprClass = self._db.getTimespanRepresentation()
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        dataIds: Optional[Set[DataCoordinate]] = (
            set() if not TimespanReprClass.hasExclusionConstraint() else None
        )
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            TimespanReprClass.update(timespan, result=row)
            governorValues.update_extract(dataset.dataId)
            rows.append(row)
            if dataIds is not None:
                dataIds.add(dataset.dataId)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values will be
        # inserted there.
        self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
        # Update the association table itself.
        if TimespanReprClass.hasExclusionConstraint():
            # Rely on the database constraint to enforce invariants; we just
            # reraise the exception for consistency across DB engines.
            try:
                self._db.insert(self._calibs, *rows)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Validity range conflict certifying datasets of type {self.datasetType.name} "
                    f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
                ) from err
        else:
            # Have to implement the exclusion constraint ourselves.
            # Start by building a SELECT query for any rows that would overlap
            # this one.
            query = self._buildCalibOverlapQuery(
                collection,
                DataCoordinateSet(dataIds, graph=self.datasetType.dimensions),  # type: ignore
                timespan
            )
            query.columns.append(sqlalchemy.sql.func.count())
            sql = query.combine()
            # Acquire a table lock to ensure there are no concurrent writes
            # that could invalidate our check before we finish the inserts.
            # We use a SAVEPOINT in case there is an outer transaction that a
            # failure here should not roll back.
            with self._db.transaction(lock=[self._calibs], savepoint=True):
                # Run the check SELECT query.
                conflicting = self._db.query(sql).scalar()
                if conflicting > 0:
                    raise ConflictingDefinitionError(
                        f"{conflicting} validity range conflicts certifying datasets of type "
                        f"{self.datasetType.name} into {collection.name} for range "
                        f"[{timespan.begin}, {timespan.end})."
                    )
                # Proceed with the insert.
                self._db.insert(self._calibs, *rows)
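    # For example: certifying a dataset for detector 1 over [t1, t3) into a
    # collection that already has a dataset of this type certified for
    # detector 1 over [t2, t4), with t1 < t2 < t3, raises
    # ConflictingDefinitionError, because the two validity ranges overlap for
    # the same dataset type, collection, and data ID.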

    def decertify(self, collection: CollectionRecord, timespan: Timespan, *,
                  dataIds: Optional[Iterable[DataCoordinate]] = None) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:
            raise TypeError(f"Cannot decertify datasets of type {self.datasetType.name}, for which "
                            f"DatasetType.isCalibration() is False.")
        if collection.type is not CollectionType.CALIBRATION:
            raise TypeError(f"Cannot decertify from collection '{collection.name}' "
                            f"of type {collection.type.name}; must be CALIBRATION.")
        TimespanReprClass = self._db.getTimespanRepresentation()
        # Construct a SELECT query to find all rows that overlap our inputs.
        dataIdSet: Optional[DataCoordinateSet]
        if dataIds is not None:
            dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions)
        else:
            dataIdSet = None
        query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan)
        query.columns.extend(self._calibs.columns)
        sql = query.combine()
        # Set up collections to populate with the rows we'll want to modify.
        # The insert rows will have the same values for collection and
        # dataset type.
        protoInsertRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rowsToDelete = []
        rowsToInsert = []
        # Acquire a table lock to ensure there are no concurrent writes
        # between the SELECT and the DELETE and INSERT queries based on it.
        with self._db.transaction(lock=[self._calibs], savepoint=True):
            for row in self._db.query(sql).mappings():
                rowsToDelete.append({"id": row["id"]})
                # Construct the insert row(s) by copying the prototype row,
                # then adding the dimension column values, then adding what's
                # left of the timespan from that row after we subtract the
                # given timespan.
                newInsertRow = protoInsertRow.copy()
                newInsertRow["dataset_id"] = row["dataset_id"]
                for name in self.datasetType.dimensions.required.names:
                    newInsertRow[name] = row[name]
                rowTimespan = TimespanReprClass.extract(row)
                assert rowTimespan is not None, "Field should have a NOT NULL constraint."
                for diffTimespan in rowTimespan.difference(timespan):
                    rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()))
            # Run the DELETE and INSERT queries.
            self._db.delete(self._calibs, ["id"], *rowsToDelete)
            self._db.insert(self._calibs, *rowsToInsert)
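    # For example: if a dataset is certified over [t1, t4) and we decertify
    # [t2, t3), with t1 < t2 < t3 < t4, the original calibs row is deleted
    # and replaced by two rows whose validity ranges are [t1, t2) and
    # [t3, t4), i.e. the pieces returned by Timespan.difference.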

    def select(self, *collections: CollectionRecord,
               dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
               id: SimpleQuery.Select.Or[Optional[int]] = SimpleQuery.Select,
               run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
               timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
               ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
               ) -> SimpleQuery:
        # Docstring inherited from DatasetRecordStorage.
        collection_types = {collection.type for collection in collections}
        assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened."
        #
        # There are two tables in play here:
        #
        #  - the static dataset table (with the dataset ID, dataset type ID,
        #    run ID/name, and ingest date);
        #
        #  - the dynamic tags/calibs table (with the dataset ID, dataset type
        #    ID, collection ID/name, data ID, and possibly validity range).
        #
        # That means that we might want to return a query against either table
        # or a JOIN of both, depending on which quantities the caller wants.
        # But this method is documented/typed such that ``dataId`` is never
        # `None` - i.e. we always constrain or retrieve the data ID.  That
        # means we'll always include the tags/calibs table and join in the
        # static dataset table only if we need things from it that we can't
        # get from the tags/calibs table.
        #
        # Note that it's important that we include a WHERE constraint on both
        # tables for any column (e.g. dataset_type_id) that is in both when
        # it's given explicitly; not doing so can prevent the query planner
        # from using very important indexes.  At present, we don't include
        # those redundant columns in the JOIN ON expression, however, because
        # the FOREIGN KEY (and its index) are defined only on dataset_id.
        #
        # We'll start with an empty SimpleQuery, and accumulate kwargs to pass
        # to its `join` method when we bring in the tags/calibs table.
        query = SimpleQuery()
        # We get the data ID or constrain it in the tags/calibs table, but
        # that's multiple columns, not one, so we need to transform the one
        # Select.Or argument into a dictionary of them.
        kwargs: Dict[str, Any]
        if dataId is SimpleQuery.Select:
            kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required}
        else:
            kwargs = dict(dataId.byName())
        # We always constrain (never retrieve) the dataset type in at least
        # the tags/calibs table.
        kwargs["dataset_type_id"] = self._dataset_type_id
        # Join in the tags or calibs table, turning those 'kwargs' entries
        # into WHERE constraints or SELECT columns as appropriate.
        if collection_types == {CollectionType.CALIBRATION}:
            assert self._calibs is not None, \
                "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
            TimespanReprClass = self._db.getTimespanRepresentation()
            # Add the timespan column(s) to the result columns, or constrain
            # the timespan via an overlap condition.
            if timespan is SimpleQuery.Select:
                kwargs.update({k: SimpleQuery.Select for k in TimespanReprClass.getFieldNames()})
            elif timespan is not None:
                query.where.append(
                    TimespanReprClass.fromSelectable(self._calibs).overlaps(
                        TimespanReprClass.fromLiteral(timespan)
                    )
                )
            query.join(self._calibs, **kwargs)
            dataset_id_col = self._calibs.columns.dataset_id
            collection_col = self._calibs.columns[self._collections.getCollectionForeignKeyName()]
        elif CollectionType.CALIBRATION not in collection_types:
            query.join(self._tags, **kwargs)
            dataset_id_col = self._tags.columns.dataset_id
            collection_col = self._tags.columns[self._collections.getCollectionForeignKeyName()]
        else:
            raise TypeError(
                "Cannot query for CALIBRATION collections in the same "
                "subquery as other kinds of collections."
            )
        # We always constrain (never retrieve) the collection(s) in the
        # tags/calibs table.
        if len(collections) == 1:
            query.where.append(collection_col == collections[0].key)
        elif len(collections) == 0:
            # We support the case where there are no collections as a way to
            # generate a valid SQL query that can't yield results.  This
            # should never get executed, but lots of downstream code will
            # still try to access the SQLAlchemy objects representing the
            # columns in the subquery.  That's not ideal, but it'd take a lot
            # of refactoring to fix it.
            query.where.append(sqlalchemy.sql.literal(False))
        else:
            query.where.append(collection_col.in_([collection.key for collection in collections]))
        # We can always get the dataset_id from the tags/calibs table or
        # constrain it there.  Can't use kwargs for that because we need to
        # alias it to 'id'.
        if id is SimpleQuery.Select:
            query.columns.append(dataset_id_col.label("id"))
        elif id is not None:
            query.where.append(dataset_id_col == id)
        # It's possible we now have everything we need, from just the
        # tags/calibs table.  The things we might need to get from the static
        # dataset table are the run key and the ingest date.
        need_static_table = False
        static_kwargs: Dict[str, Any] = {}
        if run is not None:
            assert run is SimpleQuery.Select, "To constrain the run name, pass a RunRecord as a collection."
            if len(collections) == 1 and collections[0].type is CollectionType.RUN:
                # If we are searching exactly one RUN collection, we know that
                # if we find the dataset in that collection, then that's the
                # dataset's run; we don't need to query for it.
                query.columns.append(sqlalchemy.sql.literal(collections[0].key).label(self._runKeyColumn))
            else:
                static_kwargs[self._runKeyColumn] = SimpleQuery.Select
                need_static_table = True
        # Ingest date can only come from the static table.
        if ingestDate is not None:
            need_static_table = True
            if ingestDate is SimpleQuery.Select:
                static_kwargs["ingest_date"] = SimpleQuery.Select
            else:
                assert isinstance(ingestDate, Timespan)
                # Timespan bounds are astropy Time (usually in TAI) while
                # ingest_date is a TIMESTAMP; convert the values to Python
                # datetime for SQLAlchemy.
                if ingestDate.isEmpty():
                    raise RuntimeError("Empty timespan constraint provided for ingest_date.")
                if ingestDate.begin is not None:
                    begin = ingestDate.begin.utc.datetime  # type: ignore
                    query.where.append(self._static.dataset.columns.ingest_date >= begin)
                if ingestDate.end is not None:
                    end = ingestDate.end.utc.datetime  # type: ignore
                    query.where.append(self._static.dataset.columns.ingest_date < end)
        # If we need the static table, join it in via dataset_id and
        # dataset_type_id.
        if need_static_table:
            query.join(
                self._static.dataset,
                onclause=(dataset_id_col == self._static.dataset.columns.id),
                **static_kwargs,
            )
            # Also constrain dataset_type_id in the static table in case that
            # helps generate a better plan.  We could also include this in the
            # JOIN ON clause, but my guess is that that's a good idea IFF it's
            # in the foreign key, and right now it isn't.
            query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id)
        return query
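    # For a single TAGGED or RUN collection with the default Select arguments
    # for ``dataId``, ``id``, and ``run``, the combined query is roughly
    # (sketch; real table and column names come from the managers):
    #
    #     SELECT <data ID columns>, dataset_id AS id, <run key>
    #     FROM <tags> [JOIN <static dataset> ON dataset_id = id]
    #     WHERE dataset_type_id = :dataset_type_id AND <collection_fk> = :key
    #
    # The JOIN to the static table is only needed to fetch the run key when
    # the single collection is not itself a RUN collection; for a RUN
    # collection the run key is emitted as a literal instead.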

    def getDataId(self, id: DatasetId) -> DataCoordinate:
        """Return DataId for a dataset.

        Parameters
        ----------
        id : `DatasetId`
            Unique dataset identifier.

        Returns
        -------
        dataId : `DataCoordinate`
            DataId for the dataset.
        """
        # This query could return multiple rows (one for each tagged
        # collection the dataset is in, plus one for its run collection),
        # and we don't care which of those we get.
        sql = self._tags.select().where(
            sqlalchemy.sql.and_(
                self._tags.columns.dataset_id == id,
                self._tags.columns.dataset_type_id == self._dataset_type_id
            )
        ).limit(1)
        row = self._db.query(sql).mappings().fetchone()
        assert row is not None, "Should be guaranteed by caller and foreign key constraints."
        return DataCoordinate.standardize(
            {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
            graph=self.datasetType.dimensions
        )


class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage):
    """Implementation of ByDimensionsDatasetRecordStorage which uses an
    auto-incremented integer column for dataset IDs.
    """

    def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate],
               idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # We only support UNIQUE mode for integer dataset IDs.
        if idMode != DatasetIdGenEnum.UNIQUE:
            raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.")

        # Transform a possibly-single-pass iterable into a list.
        dataIdList = list(dataIds)
        yield from self._insert(run, dataIdList)

    def import_(self, run: RunRecord, datasets: Iterable[DatasetRef],
                idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
                reuseIds: bool = False) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # We only support UNIQUE mode for integer dataset IDs.
        if idGenerationMode != DatasetIdGenEnum.UNIQUE:
            raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.")

        # Make a list of dataIds and optionally dataset IDs.
        dataIdList: List[DataCoordinate] = []
        datasetIdList: List[int] = []
        for dataset in datasets:
            dataIdList.append(dataset.dataId)

            # We only accept integer dataset IDs, but also allow None.
            datasetId = dataset.id
            if datasetId is None:
                # If reuseIds is set then all IDs must be known.
                if reuseIds:
                    raise TypeError("All dataset IDs must be known if `reuseIds` is set")
            elif isinstance(datasetId, int):
                if reuseIds:
                    datasetIdList.append(datasetId)
            else:
                raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}")

        yield from self._insert(run, dataIdList, datasetIdList)

    def _insert(self, run: RunRecord, dataIdList: List[DataCoordinate],
                datasetIdList: Optional[List[int]] = None) -> Iterator[DatasetRef]:
        """Common part of implementation of `insert` and `import_` methods.
        """

        # Remember any governor dimension values we see.
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataId in dataIdList:
            governorValues.update_extract(dataId)

        staticRow = {
            "dataset_type_id": self._dataset_type_id,
            self._runKeyColumn: run.key,
        }
        with self._db.transaction():
            # Insert into the static dataset table, generating autoincrement
            # dataset_id values.
            if datasetIdList:
                # Reuse existing IDs.
                rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList]
                self._db.insert(self._static.dataset, *rows)
            else:
                # Use auto-incremented IDs.
                datasetIdList = self._db.insert(self._static.dataset, *([staticRow]*len(dataIdList)),
                                                returnIds=True)
                assert datasetIdList is not None
            # Update the summary tables for this collection in case this is
            # the first time this dataset type or these governor values will
            # be inserted there.
            self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
            # Combine the generated dataset_id values and data ID fields to
            # form rows to be inserted into the tags table.
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): run.key,
            }
            tagsRows = [
                dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
                for dataId, dataset_id in zip(dataIdList, datasetIdList)
            ]
            # Insert those rows into the tags table.  This is where we'll
            # get any unique constraint violations.
            self._db.insert(self._tags, *tagsRows)

        for dataId, datasetId in zip(dataIdList, datasetIdList):
            yield DatasetRef(
                datasetType=self.datasetType,
                dataId=dataId,
                id=datasetId,
                run=run.name,
            )


class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
    """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for
    dataset IDs.
    """

    NS_UUID = uuid.UUID('840b31d9-05cd-5161-b2c8-00d32b280d0f')
    """Namespace UUID used for UUID5 generation. Do not change. This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate],
               idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # Iterate over data IDs, transforming a possibly-single-pass iterable
        # into a list.
        dataIdList = []
        rows = []
        for dataId in dataIds:
            dataIdList.append(dataId)
            rows.append({
                "id": self._makeDatasetId(run, dataId, idMode),
                "dataset_type_id": self._dataset_type_id,
                self._runKeyColumn: run.key,
            })

        yield from self._insert(run, dataIdList, rows, self._db.insert)

    def import_(self, run: RunRecord, datasets: Iterable[DatasetRef],
                idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
                reuseIds: bool = False) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # Iterate over data IDs, transforming a possibly-single-pass iterable
        # into a list.
        dataIdList = []
        rows = []
        for dataset in datasets:
            dataIdList.append(dataset.dataId)
            # Ignore unknown ID types; normally all IDs have the same type,
            # but this code supports mixed types or missing IDs.
            datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None
            if datasetId is None:
                datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode)
            rows.append({
                "id": datasetId,
                "dataset_type_id": self._dataset_type_id,
                self._runKeyColumn: run.key,
            })

        yield from self._insert(run, dataIdList, rows, self._db.ensure)

    def _insert(self, run: RunRecord, dataIdList: List[DataCoordinate],
                rows: List[Dict], insertMethod: Callable) -> Iterator[DatasetRef]:
        """Common part of implementation of `insert` and `import_` methods.
        """

        # Remember any governor dimension values we see.
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataId in dataIdList:
            governorValues.update_extract(dataId)

        with self._db.transaction():
            # Insert into the static dataset table.
            insertMethod(self._static.dataset, *rows)
            # Update the summary tables for this collection in case this is
            # the first time this dataset type or these governor values will
            # be inserted there.
            self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
            # Combine the generated dataset_id values and data ID fields to
            # form rows to be inserted into the tags table.
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): run.key,
            }
            tagsRows = [
                dict(protoTagsRow, dataset_id=row["id"], **dataId.byName())
                for dataId, row in zip(dataIdList, rows)
            ]
            # Insert those rows into the tags table.
            insertMethod(self._tags, *tagsRows)
            for dataId, row in zip(dataIdList, rows):
                yield DatasetRef(
                    datasetType=self.datasetType,
                    dataId=dataId,
                    id=row["id"],
                    run=run.name,
                )

    def _makeDatasetId(self, run: RunRecord, dataId: DataCoordinate,
                       idGenerationMode: DatasetIdGenEnum) -> uuid.UUID:
        """Generate dataset ID for a dataset.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the RUN collection for the dataset.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
        """
        if idGenerationMode is DatasetIdGenEnum.UNIQUE:
            return uuid.uuid4()
        else:
            # WARNING: If you modify this code make sure that the order of
            # items in the `items` list below never changes.
            items: List[Tuple[str, str]] = []
            if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
                items = [
                    ("dataset_type", self.datasetType.name),
                ]
            elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
                items = [
                    ("dataset_type", self.datasetType.name),
                    ("run", run.name),
                ]
            else:
                raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

            for name, value in sorted(dataId.byName().items()):
                items.append((name, str(value)))
            data = ",".join(f"{key}={value}" for key, value in items)
            return uuid.uuid5(self.NS_UUID, data)
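    # For example, with DATAID_TYPE_RUN mode, a dataset type named "flat", a
    # run named "HSC/calib", and a data ID of {"instrument": "HSC",
    # "detector": 42} (all hypothetical values), the string hashed with UUID5
    # would be
    #
    #     "dataset_type=flat,run=HSC/calib,detector=42,instrument=HSC"
    #
    # i.e. the fixed (dataset_type, run) items first, then the data ID
    # key/value pairs in sorted-key order, so the same inputs always map to
    # the same uuid.uuid5(NS_UUID, data) value.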