Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py : 81%

from __future__ import annotations

__all__ = ("ByDimensionsDatasetRecordStorage",)

from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Set,
    Tuple,
    TYPE_CHECKING,
)
import uuid

import sqlalchemy

from lsst.daf.butler import (
    CollectionType,
    DataCoordinate,
    DataCoordinateSet,
    DatasetId,
    DatasetRef,
    DatasetType,
    SimpleQuery,
    Timespan,
)
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.daf.butler.registry.interfaces import DatasetRecordStorage, DatasetIdGenEnum

from ...summaries import GovernorDimensionRestriction

if TYPE_CHECKING:
    from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
    from .tables import StaticDatasetTablesTuple
    from .summaries import CollectionSummaryManager


class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
    """Dataset record storage implementation paired with
    `ByDimensionsDatasetRecordStorageManager`; see that class for more
    information.

    Instances of this class should never be constructed directly; use
    `DatasetRecordStorageManager.register` instead.
    """
    def __init__(self, *, datasetType: DatasetType,
                 db: Database,
                 dataset_type_id: int,
                 collections: CollectionManager,
                 static: StaticDatasetTablesTuple,
                 summaries: CollectionSummaryManager,
                 tags: sqlalchemy.schema.Table,
                 calibs: Optional[sqlalchemy.schema.Table]):
        super().__init__(datasetType=datasetType)
        self._dataset_type_id = dataset_type_id
        self._db = db
        self._collections = collections
        self._static = static
        self._summaries = summaries
        self._tags = tags
        self._calibs = calibs
        self._runKeyColumn = collections.getRunForeignKeyName()

    def find(self, collection: CollectionRecord, dataId: DataCoordinate,
             timespan: Optional[Timespan] = None) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.
        assert dataId.graph == self.datasetType.dimensions
        if collection.type is CollectionType.CALIBRATION and timespan is None:
            raise TypeError(f"Cannot search for dataset in CALIBRATION collection {collection.name} "
                            f"without an input timespan.")
        sql = self.select(collection=collection, dataId=dataId, id=SimpleQuery.Select,
                          run=SimpleQuery.Select, timespan=timespan).combine()
        results = self._db.query(sql)
        row = results.fetchone()
        if row is None:
            return None
        if collection.type is CollectionType.CALIBRATION:
            # For temporal calibration lookups (only!) our invariants do not
            # guarantee that the number of result rows is <= 1.
            # They would if `select` constrained the given timespan to be
            # _contained_ by the validity range in the self._calibs table,
            # instead of simply _overlapping_ it, because we do guarantee that
            # the validity ranges are disjoint for a particular dataset type,
            # collection, and data ID.  But using an overlap test and a check
            # for multiple result rows here allows us to provide a more useful
            # diagnostic, as well as allowing `select` to support more general
            # queries where multiple results are not an error.
            if results.fetchone() is not None:
                raise RuntimeError(
                    f"Multiple matches found for calibration lookup in {collection.name} for "
                    f"{self.datasetType.name} with {dataId} overlapping {timespan}. "
                )
        return DatasetRef(
            datasetType=self.datasetType,
            dataId=dataId,
            id=row["id"],
            run=self._collections[row[self._runKeyColumn]].name
        )

    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        # Only delete from common dataset table; ON DELETE foreign key clauses
        # will handle the rest.
        self._db.delete(
            self._static.dataset,
            ["id"],
            *[{"id": dataset.getCheckedId()} for dataset in datasets],
        )

    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:
            raise TypeError(f"Cannot associate into collection '{collection.name}' "
                            f"of type {collection.type.name}; must be TAGGED.")
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            governorValues.update_extract(dataset.dataId)
            rows.append(row)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values will be
        # inserted there.
        self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
        # Update the tag table itself.
        self._db.replace(self._tags, *rows)

    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:
            raise TypeError(f"Cannot disassociate from collection '{collection.name}' "
                            f"of type {collection.type.name}; must be TAGGED.")
        rows = [
            {
                "dataset_id": dataset.getCheckedId(),
                self._collections.getCollectionForeignKeyName(): collection.key
            }
            for dataset in datasets
        ]
        self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()],
                        *rows)

    def _buildCalibOverlapQuery(self, collection: CollectionRecord,
                                dataIds: Optional[DataCoordinateSet],
                                timespan: Timespan) -> SimpleQuery:
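        """Build a `SimpleQuery` selecting rows in the calibs table for this
        dataset type and the given collection whose validity ranges overlap
        ``timespan``, optionally constrained to the given data IDs.
        """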
        assert self._calibs is not None
        # Start by building a SELECT query for any rows that would overlap
        # this one.
        query = SimpleQuery()
        query.join(self._calibs)
        # Add a WHERE clause matching the dataset type and collection.
        query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id)
        query.where.append(
            self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key
        )
        # Add a WHERE clause matching any of the given data IDs.
        if dataIds is not None:
            dataIds.constrain(
                query,
                lambda name: self._calibs.columns[name],  # type: ignore
            )
        # Add WHERE clause for timespan overlaps.
        TimespanReprClass = self._db.getTimespanRepresentation()
        query.where.append(
            TimespanReprClass.fromSelectable(self._calibs).overlaps(TimespanReprClass.fromLiteral(timespan))
        )
        return query

    def certify(self, collection: CollectionRecord, datasets: Iterable[DatasetRef],
                timespan: Timespan) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:
            raise TypeError(f"Cannot certify datasets of type {self.datasetType.name}, for which "
                            f"DatasetType.isCalibration() is False.")
        if collection.type is not CollectionType.CALIBRATION:
            raise TypeError(f"Cannot certify into collection '{collection.name}' "
                            f"of type {collection.type.name}; must be CALIBRATION.")
        TimespanReprClass = self._db.getTimespanRepresentation()
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        dataIds: Optional[Set[DataCoordinate]] = (
            set() if not TimespanReprClass.hasExclusionConstraint() else None
        )
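        # We only need to collect data IDs when we have to emulate the
        # exclusion constraint ourselves (the overlap-check branch below);
        # if the database enforces it natively, this set is never used.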
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            TimespanReprClass.update(timespan, result=row)
            governorValues.update_extract(dataset.dataId)
            rows.append(row)
            if dataIds is not None:
                dataIds.add(dataset.dataId)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values will be
        # inserted there.
        self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
        # Update the association table itself.
        if TimespanReprClass.hasExclusionConstraint():
            # Rely on database constraint to enforce invariants; we just
            # reraise the exception for consistency across DB engines.
            try:
                self._db.insert(self._calibs, *rows)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Validity range conflict certifying datasets of type {self.datasetType.name} "
                    f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
                ) from err
        else:
            # Have to implement exclusion constraint ourselves.
            # Start by building a SELECT query for any rows that would overlap
            # this one.
            query = self._buildCalibOverlapQuery(
                collection,
                DataCoordinateSet(dataIds, graph=self.datasetType.dimensions),  # type: ignore
                timespan
            )
            query.columns.append(sqlalchemy.sql.func.count())
            sql = query.combine()
            # Acquire a table lock to ensure there are no concurrent writes
            # that could invalidate our check before we finish the inserts.
            # We use a SAVEPOINT in case there is an outer transaction that a
            # failure here should not roll back.
            with self._db.transaction(lock=[self._calibs], savepoint=True):
                # Run the check SELECT query.
                conflicting = self._db.query(sql).scalar()
                if conflicting > 0:
                    raise ConflictingDefinitionError(
                        f"{conflicting} validity range conflicts certifying datasets of type "
                        f"{self.datasetType.name} into {collection.name} for range "
                        f"[{timespan.begin}, {timespan.end})."
                    )
                # Proceed with the insert.
                self._db.insert(self._calibs, *rows)

    def decertify(self, collection: CollectionRecord, timespan: Timespan, *,
                  dataIds: Optional[Iterable[DataCoordinate]] = None) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:
            raise TypeError(f"Cannot decertify datasets of type {self.datasetType.name}, for which "
                            f"DatasetType.isCalibration() is False.")
        if collection.type is not CollectionType.CALIBRATION:
            raise TypeError(f"Cannot decertify from collection '{collection.name}' "
                            f"of type {collection.type.name}; must be CALIBRATION.")
        TimespanReprClass = self._db.getTimespanRepresentation()
        # Construct a SELECT query to find all rows that overlap our inputs.
        dataIdSet: Optional[DataCoordinateSet]
        if dataIds is not None:
            dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions)
        else:
            dataIdSet = None
        query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan)
        query.columns.extend(self._calibs.columns)
        sql = query.combine()
        # Set up collections to populate with the rows we'll want to modify.
        # The insert rows will have the same values for collection and
        # dataset type.
        protoInsertRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rowsToDelete = []
        rowsToInsert = []
        # Acquire a table lock to ensure there are no concurrent writes
        # between the SELECT and the DELETE and INSERT queries based on it.
        with self._db.transaction(lock=[self._calibs], savepoint=True):
            for row in self._db.query(sql):
                rowsToDelete.append({"id": row["id"]})
                # Construct the insert row(s) by copying the prototype row,
                # then adding the dimension column values, then adding what's
                # left of the timespan from that row after we subtract the
                # given timespan.
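                # For example, subtracting a decertification timespan
                # [t2, t3) from a row valid over [t1, t4) yields two
                # replacement rows covering [t1, t2) and [t3, t4).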
                newInsertRow = protoInsertRow.copy()
                newInsertRow["dataset_id"] = row["dataset_id"]
                for name in self.datasetType.dimensions.required.names:
                    newInsertRow[name] = row[name]
                rowTimespan = TimespanReprClass.extract(row)
                assert rowTimespan is not None, "Field should have a NOT NULL constraint."
                for diffTimespan in rowTimespan.difference(timespan):
                    rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()))
            # Run the DELETE and INSERT queries.
            self._db.delete(self._calibs, ["id"], *rowsToDelete)
            self._db.insert(self._calibs, *rowsToInsert)

    def select(self, collection: CollectionRecord,
               dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
               id: SimpleQuery.Select.Or[Optional[int]] = SimpleQuery.Select,
               run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
               timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
               ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
               ) -> SimpleQuery:
        # Docstring inherited from DatasetRecordStorage.
        assert collection.type is not CollectionType.CHAINED
        query = SimpleQuery()
        # We always include the _static.dataset table, and we can always get
        # the id and run fields from that; passing them as kwargs here tells
        # SimpleQuery to handle them whether they're constraints or results.
        # We always constrain the dataset_type_id here as well.
        static_kwargs = {self._runKeyColumn: run}
        if ingestDate is not None:
            static_kwargs["ingest_date"] = SimpleQuery.Select
        query.join(
            self._static.dataset,
            id=id,
            dataset_type_id=self._dataset_type_id,
            **static_kwargs
        )
        # If and only if the collection is a RUN, we constrain it in the
        # static table (and also in the tags or calibs table below).
        if collection.type is CollectionType.RUN:
            query.where.append(self._static.dataset.columns[self._runKeyColumn]
                               == collection.key)
        # We get or constrain the data ID from the tags/calibs table, but
        # that's multiple columns, not one, so we need to transform the one
        # Select.Or argument into a dictionary of them.
        kwargs: Dict[str, Any]
        if dataId is SimpleQuery.Select:
            kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required}
        else:
            kwargs = dict(dataId.byName())
        # We always constrain (never retrieve) the collection from the tags
        # table.
        kwargs[self._collections.getCollectionForeignKeyName()] = collection.key
        # Constrain the ingest time.
        if isinstance(ingestDate, Timespan):
            # Timespan bounds are astropy Time (usually in TAI) and
            # ingest_date is TIMESTAMP, so convert values to Python datetime
            # for sqlalchemy.
            if ingestDate.isEmpty():
                raise RuntimeError("Empty timespan constraint provided for ingest_date.")
            if ingestDate.begin is not None:
                begin = ingestDate.begin.utc.datetime  # type: ignore
                query.where.append(self._static.dataset.ingest_date >= begin)
            if ingestDate.end is not None:
                end = ingestDate.end.utc.datetime  # type: ignore
                query.where.append(self._static.dataset.ingest_date < end)
        # And now we finally join in the tags or calibs table.
        if collection.type is CollectionType.CALIBRATION:
            assert self._calibs is not None, \
                "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
            TimespanReprClass = self._db.getTimespanRepresentation()
            # Add the timespan column(s) to the result columns, or constrain
            # the timespan via an overlap condition.
            if timespan is SimpleQuery.Select:
                kwargs.update({k: SimpleQuery.Select for k in TimespanReprClass.getFieldNames()})
            elif timespan is not None:
                query.where.append(
                    TimespanReprClass.fromSelectable(self._calibs).overlaps(
                        TimespanReprClass.fromLiteral(timespan)
                    )
                )
            query.join(
                self._calibs,
                onclause=(self._static.dataset.columns.id == self._calibs.columns.dataset_id),
                **kwargs
            )
        else:
            query.join(
                self._tags,
                onclause=(self._static.dataset.columns.id == self._tags.columns.dataset_id),
                **kwargs
            )
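        # For RUN and TAGGED collections, the data ID columns come from the
        # tags table, which holds one row per dataset per collection
        # (including the dataset's own RUN; see the insert implementations
        # below).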
        return query

    def getDataId(self, id: DatasetId) -> DataCoordinate:
        """Return DataId for a dataset.

        Parameters
        ----------
        id : `DatasetId`
            Unique dataset identifier.

        Returns
        -------
        dataId : `DataCoordinate`
            DataId for the dataset.
        """
        # This query could return multiple rows (one for each tagged
        # collection the dataset is in, plus one for its run collection),
        # and we don't care which of those we get.
        sql = self._tags.select().where(
            sqlalchemy.sql.and_(
                self._tags.columns.dataset_id == id,
                self._tags.columns.dataset_type_id == self._dataset_type_id
            )
        ).limit(1)
        row = self._db.query(sql).fetchone()
        assert row is not None, "Should be guaranteed by caller and foreign key constraints."
        return DataCoordinate.standardize(
            {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
            graph=self.datasetType.dimensions
        )


class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage):
    """Implementation of ByDimensionsDatasetRecordStorage which uses an
    auto-incremented integer column for dataset IDs.
    """

    def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate],
               idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # We only support UNIQUE mode for integer dataset IDs.
        if idMode != DatasetIdGenEnum.UNIQUE:
            raise ValueError("Only UNIQUE mode can be used with integer dataset IDs.")

        # Transform a possibly-single-pass iterable into a list.
        dataIdList = list(dataIds)
        yield from self._insert(run, dataIdList)

    def import_(self, run: RunRecord, datasets: Iterable[DatasetRef],
                idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
                reuseIds: bool = False) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # We only support UNIQUE mode for integer dataset IDs.
        if idGenerationMode != DatasetIdGenEnum.UNIQUE:
            raise ValueError("Only UNIQUE mode can be used with integer dataset IDs.")

        # Make a list of dataIds and optionally dataset IDs.
        dataIdList: List[DataCoordinate] = []
        datasetIdList: List[int] = []
        for dataset in datasets:
            dataIdList.append(dataset.dataId)

            # We only accept integer dataset IDs, but also allow None.
            datasetId = dataset.id
            if datasetId is None:
                # If reuseIds is set then all IDs must be known.
                if reuseIds:
                    raise TypeError("All dataset IDs must be known if `reuseIds` is set")
            elif isinstance(datasetId, int):
                if reuseIds:
                    datasetIdList.append(datasetId)
            else:
                raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}")

        yield from self._insert(run, dataIdList, datasetIdList)

    def _insert(self, run: RunRecord, dataIdList: List[DataCoordinate],
                datasetIdList: Optional[List[int]] = None) -> Iterator[DatasetRef]:
        """Common part of implementation of `insert` and `import_` methods.
        """
        # Remember any governor dimension values we see.
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataId in dataIdList:
            governorValues.update_extract(dataId)

        staticRow = {
            "dataset_type_id": self._dataset_type_id,
            self._runKeyColumn: run.key,
        }
        with self._db.transaction():
            # Insert into the static dataset table, generating autoincrement
            # dataset_id values.
            if datasetIdList:
                # Reuse existing IDs.
                rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList]
                self._db.insert(self._static.dataset, *rows)
            else:
                # Use auto-incremented IDs.
                datasetIdList = self._db.insert(self._static.dataset, *([staticRow]*len(dataIdList)),
                                                returnIds=True)
                assert datasetIdList is not None
            # Update the summary tables for this collection in case this is the
            # first time this dataset type or these governor values will be
            # inserted there.
            self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
            # Combine the generated dataset_id values and data ID fields to
            # form rows to be inserted into the tags table.
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): run.key,
            }
            tagsRows = [
                dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
                for dataId, dataset_id in zip(dataIdList, datasetIdList)
            ]
            # Insert those rows into the tags table.  This is where we'll
            # get any unique constraint violations.
            self._db.insert(self._tags, *tagsRows)

        for dataId, datasetId in zip(dataIdList, datasetIdList):
            yield DatasetRef(
                datasetType=self.datasetType,
                dataId=dataId,
                id=datasetId,
                run=run.name,
            )


class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
    """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for
    dataset IDs.
    """

    NS_UUID = uuid.UUID('840b31d9-05cd-5161-b2c8-00d32b280d0f')
    """Namespace UUID used for UUID5 generation.  Do not change.  This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate],
               idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # Iterate over data IDs, transforming a possibly-single-pass iterable
        # into a list.
        dataIdList = []
        rows = []
        for dataId in dataIds:
            dataIdList.append(dataId)
            rows.append({
                "id": self._makeDatasetId(run, dataId, idMode),
                "dataset_type_id": self._dataset_type_id,
                self._runKeyColumn: run.key,
            })

        yield from self._insert(run, dataIdList, rows, self._db.insert)

    def import_(self, run: RunRecord, datasets: Iterable[DatasetRef],
                idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
                reuseIds: bool = False) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # Iterate over data IDs, transforming a possibly-single-pass iterable
        # into a list.
        dataIdList = []
        rows = []
        for dataset in datasets:
            dataIdList.append(dataset.dataId)
            # Ignore unknown ID types; normally all IDs have the same type,
            # but this code supports mixed types or missing IDs.
            datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None
            if datasetId is None:
                datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode)
            rows.append({
                "id": datasetId,
                "dataset_type_id": self._dataset_type_id,
                self._runKeyColumn: run.key,
            })
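
        # Use `Database.ensure` here (rather than `insert`) so that rows
        # whose IDs already exist, e.g. from a previous import with
        # deterministic UUIDs, are skipped instead of raising.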
        yield from self._insert(run, dataIdList, rows, self._db.ensure)

    def _insert(self, run: RunRecord, dataIdList: List[DataCoordinate],
                rows: List[Dict], insertMethod: Callable) -> Iterator[DatasetRef]:
        """Common part of implementation of `insert` and `import_` methods.
        """
        # Remember any governor dimension values we see.
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataId in dataIdList:
            governorValues.update_extract(dataId)

        with self._db.transaction():
            # Insert into the static dataset table.
            insertMethod(self._static.dataset, *rows)
            # Update the summary tables for this collection in case this is the
            # first time this dataset type or these governor values will be
            # inserted there.
            self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
            # Combine the generated dataset_id values and data ID fields to
            # form rows to be inserted into the tags table.
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): run.key,
            }
            tagsRows = [
                dict(protoTagsRow, dataset_id=row["id"], **dataId.byName())
                for dataId, row in zip(dataIdList, rows)
            ]
            # Insert those rows into the tags table.
            insertMethod(self._tags, *tagsRows)
        for dataId, row in zip(dataIdList, rows):
            yield DatasetRef(
                datasetType=self.datasetType,
                dataId=dataId,
                id=row["id"],
                run=run.name,
            )

    def _makeDatasetId(self, run: RunRecord, dataId: DataCoordinate,
                       idGenerationMode: DatasetIdGenEnum) -> uuid.UUID:
        """Generate dataset ID for a dataset.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the RUN collection for the dataset.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option.  `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID.  `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``.  `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
        """
        if idGenerationMode is DatasetIdGenEnum.UNIQUE:
            return uuid.uuid4()
        else:
            # WARNING: If you modify this code make sure that the order of
            # items in the `items` list below never changes.
            items: List[Tuple[str, str]] = []
            if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
                items = [
                    ("dataset_type", self.datasetType.name),
                ]
            elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
                items = [
                    ("dataset_type", self.datasetType.name),
                    ("run", run.name),
                ]
            else:
                raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

            for name, value in sorted(dataId.byName().items()):
                items.append((name, str(value)))
            data = ",".join(f"{key}={value}" for key, value in items)
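            # For example, DATAID_TYPE_RUN with dataset type "flat", run
            # "calib/run1", and data ID {"detector": 42} would hash the
            # string "dataset_type=flat,run=calib/run1,detector=42".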
            return uuid.uuid5(self.NS_UUID, data)