Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 85%


276 statements  

1from __future__ import annotations 

2 

3__all__ = ("ByDimensionsDatasetRecordStorage",) 

4 

5from typing import ( 

6 Any, 

7 Callable, 

8 Dict, 

9 Iterable, 

10 Iterator, 

11 List, 

12 Optional, 

13 Set, 

14 Tuple, 

15 TYPE_CHECKING, 

16) 

17import uuid 

18 

19import sqlalchemy 

20 

21from lsst.daf.butler import ( 

22 CollectionType, 

23 DataCoordinate, 

24 DataCoordinateSet, 

25 DatasetId, 

26 DatasetRef, 

27 DatasetType, 

28 SimpleQuery, 

29 Timespan, 

30) 

31from lsst.daf.butler.registry import ConflictingDefinitionError, UnsupportedIdGeneratorError 

32from lsst.daf.butler.registry.interfaces import DatasetRecordStorage, DatasetIdGenEnum 

33 

34from ...summaries import GovernorDimensionRestriction 

35 

36if TYPE_CHECKING: 36 ↛ 37 (line 36 didn't jump to line 37, because the condition on line 36 was never true)

37 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord 

38 from .tables import StaticDatasetTablesTuple 

39 from .summaries import CollectionSummaryManager 

40 

41 

42class ByDimensionsDatasetRecordStorage(DatasetRecordStorage): 

43 """Dataset record storage implementation paired with 

44 `ByDimensionsDatasetRecordStorageManager`; see that class for more 

45 information. 

46 

47 Instances of this class should never be constructed directly; use 

48 `DatasetRecordStorageManager.register` instead. 

49 """ 

50 

51 def __init__(self, *, datasetType: DatasetType, 

52 db: Database, 

53 dataset_type_id: int, 

54 collections: CollectionManager, 

55 static: StaticDatasetTablesTuple, 

56 summaries: CollectionSummaryManager, 

57 tags: sqlalchemy.schema.Table, 

58 calibs: Optional[sqlalchemy.schema.Table]): 

59 super().__init__(datasetType=datasetType) 

60 self._dataset_type_id = dataset_type_id 

61 self._db = db 

62 self._collections = collections 

63 self._static = static 

64 self._summaries = summaries 

65 self._tags = tags 

66 self._calibs = calibs 

67 self._runKeyColumn = collections.getRunForeignKeyName() 

68 

69 def find(self, collection: CollectionRecord, dataId: DataCoordinate, 

70 timespan: Optional[Timespan] = None) -> Optional[DatasetRef]: 

71 # Docstring inherited from DatasetRecordStorage. 

72 assert dataId.graph == self.datasetType.dimensions 

73 if collection.type is CollectionType.CALIBRATION and timespan is None: 73 ↛ 74 (line 73 didn't jump to line 74, because the condition on line 73 was never true)

74 raise TypeError(f"Cannot search for dataset in CALIBRATION collection {collection.name} " 

75 f"without an input timespan.") 

76 sql = self.select(collection, dataId=dataId, id=SimpleQuery.Select, 

77 run=SimpleQuery.Select, timespan=timespan) 

78 sql = sql.combine() 

79 results = self._db.query(sql) 

80 row = results.fetchone() 

81 if row is None: 

82 return None 

83 if collection.type is CollectionType.CALIBRATION: 

84 # For temporal calibration lookups (only!) our invariants do not 

85 # guarantee that the number of result rows is <= 1. 

86 # They would if `select` constrained the given timespan to be 

87 # _contained_ by the validity range in the self._calibs table, 

88 # instead of simply _overlapping_ it, because we do guarantee that 

89 # the validity ranges are disjoint for a particular dataset type, 

90 # collection, and data ID. But using an overlap test and a check 

91 # for multiple result rows here allows us to provide a more useful 

92 # diagnostic, as well as allowing `select` to support more general 

93 # queries where multiple results are not an error. 
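# For example (hypothetical values): with two certified datasets for the
# same data ID whose validity ranges are the disjoint [t1, t2) and
# [t2, t3), a lookup with timespan [t1, t3) overlaps both rows, so the
# duplicate check below raises instead of silently returning an
# arbitrary match.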

94 if results.fetchone() is not None: 

95 raise RuntimeError( 

96 f"Multiple matches found for calibration lookup in {collection.name} for " 

97 f"{self.datasetType.name} with {dataId} overlapping {timespan}. " 

98 ) 

99 return DatasetRef( 

100 datasetType=self.datasetType, 

101 dataId=dataId, 

102 id=row.id, 

103 run=self._collections[row._mapping[self._runKeyColumn]].name 

104 ) 

105 

106 def delete(self, datasets: Iterable[DatasetRef]) -> None: 

107 # Docstring inherited from DatasetRecordStorage. 

108 # Only delete from common dataset table; ON DELETE foreign key clauses 

109 # will handle the rest. 

110 self._db.delete( 

111 self._static.dataset, 

112 ["id"], 

113 *[{"id": dataset.getCheckedId()} for dataset in datasets], 

114 ) 

115 

116 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

117 # Docstring inherited from DatasetRecordStorage. 

118 if collection.type is not CollectionType.TAGGED: 118 ↛ 119 (line 118 didn't jump to line 119, because the condition on line 118 was never true)

119 raise TypeError(f"Cannot associate into collection '{collection.name}' " 

120 f"of type {collection.type.name}; must be TAGGED.") 

121 protoRow = { 

122 self._collections.getCollectionForeignKeyName(): collection.key, 

123 "dataset_type_id": self._dataset_type_id, 

124 } 

125 rows = [] 

126 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

127 for dataset in datasets: 

128 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

129 for dimension, value in dataset.dataId.items(): 

130 row[dimension.name] = value 

131 governorValues.update_extract(dataset.dataId) 

132 rows.append(row) 

133 # Update the summary tables for this collection in case this is the 

134 # first time this dataset type or these governor values will be 

135 # inserted there. 

136 self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues) 

137 # Update the tag table itself. 

138 self._db.replace(self._tags, *rows) 

139 

140 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

141 # Docstring inherited from DatasetRecordStorage. 

142 if collection.type is not CollectionType.TAGGED: 142 ↛ 143 (line 142 didn't jump to line 143, because the condition on line 142 was never true)

143 raise TypeError(f"Cannot disassociate from collection '{collection.name}' " 

144 f"of type {collection.type.name}; must be TAGGED.") 

145 rows = [ 

146 { 

147 "dataset_id": dataset.getCheckedId(), 

148 self._collections.getCollectionForeignKeyName(): collection.key 

149 } 

150 for dataset in datasets 

151 ] 

152 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], 

153 *rows) 

154 

155 def _buildCalibOverlapQuery(self, collection: CollectionRecord, 

156 dataIds: Optional[DataCoordinateSet], 

157 timespan: Timespan) -> SimpleQuery: 

158 assert self._calibs is not None 

159 # Start by building a SELECT query for any rows that would overlap 

160 # this one. 

161 query = SimpleQuery() 

162 query.join(self._calibs) 

163 # Add a WHERE clause matching the dataset type and collection. 

164 query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id) 

165 query.where.append( 

166 self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key 

167 ) 

168 # Add a WHERE clause matching any of the given data IDs. 

169 if dataIds is not None: 

170 dataIds.constrain( 

171 query, 

172 lambda name: self._calibs.columns[name], # type: ignore 

173 ) 

174 # Add WHERE clause for timespan overlaps. 

175 TimespanReprClass = self._db.getTimespanRepresentation() 

176 query.where.append( 

177 TimespanReprClass.fromSelectable(self._calibs).overlaps(TimespanReprClass.fromLiteral(timespan)) 

178 ) 

179 return query 

180 

181 def certify(self, collection: CollectionRecord, datasets: Iterable[DatasetRef], 

182 timespan: Timespan) -> None: 

183 # Docstring inherited from DatasetRecordStorage. 

184 if self._calibs is None: 184 ↛ 185 (line 184 didn't jump to line 185, because the condition on line 184 was never true)

185 raise TypeError(f"Cannot certify datasets of type {self.datasetType.name}, for which " 

186 f"DatasetType.isCalibration() is False.") 

187 if collection.type is not CollectionType.CALIBRATION: 187 ↛ 188 (line 187 didn't jump to line 188, because the condition on line 187 was never true)

188 raise TypeError(f"Cannot certify into collection '{collection.name}' " 

189 f"of type {collection.type.name}; must be CALIBRATION.") 

190 TimespanReprClass = self._db.getTimespanRepresentation() 

191 protoRow = { 

192 self._collections.getCollectionForeignKeyName(): collection.key, 

193 "dataset_type_id": self._dataset_type_id, 

194 } 

195 rows = [] 

196 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

197 dataIds: Optional[Set[DataCoordinate]] = ( 

198 set() if not TimespanReprClass.hasExclusionConstraint() else None 

199 ) 

200 for dataset in datasets: 

201 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

202 for dimension, value in dataset.dataId.items(): 

203 row[dimension.name] = value 

204 TimespanReprClass.update(timespan, result=row) 

205 governorValues.update_extract(dataset.dataId) 

206 rows.append(row) 

207 if dataIds is not None: 207 ↛ 200 (line 207 didn't jump to line 200, because the condition on line 207 was never false)

208 dataIds.add(dataset.dataId) 

209 # Update the summary tables for this collection in case this is the 

210 # first time this dataset type or these governor values will be 

211 # inserted there. 

212 self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues) 

213 # Update the association table itself. 

214 if TimespanReprClass.hasExclusionConstraint(): 214 ↛ 217 (line 214 didn't jump to line 217, because the condition on line 214 was never true)

215 # Rely on database constraint to enforce invariants; we just 

216 # reraise the exception for consistency across DB engines. 

217 try: 

218 self._db.insert(self._calibs, *rows) 

219 except sqlalchemy.exc.IntegrityError as err: 

220 raise ConflictingDefinitionError( 

221 f"Validity range conflict certifying datasets of type {self.datasetType.name} " 

222 f"into {collection.name} for range [{timespan.begin}, {timespan.end})." 

223 ) from err 

224 else: 

225 # Have to implement exclusion constraint ourselves. 

226 # Start by building a SELECT query for any rows that would overlap 

227 # this one. 

228 query = self._buildCalibOverlapQuery( 

229 collection, 

230 DataCoordinateSet(dataIds, graph=self.datasetType.dimensions), # type: ignore 

231 timespan 

232 ) 

233 query.columns.append(sqlalchemy.sql.func.count()) 

234 sql = query.combine() 

235 # Acquire a table lock to ensure there are no concurrent writes 

236 # that could invalidate our check before we finish the inserts. We 

237 # use a SAVEPOINT in case there is an outer transaction that a 

238 # failure here should not roll back. 

239 with self._db.transaction(lock=[self._calibs], savepoint=True): 

240 # Run the check SELECT query. 

241 conflicting = self._db.query(sql).scalar() 

242 if conflicting > 0: 

243 raise ConflictingDefinitionError( 

244 f"{conflicting} validity range conflicts certifying datasets of type " 

245 f"{self.datasetType.name} into {collection.name} for range " 

246 f"[{timespan.begin}, {timespan.end})." 

247 ) 

248 # Proceed with the insert. 

249 self._db.insert(self._calibs, *rows) 

250 

251 def decertify(self, collection: CollectionRecord, timespan: Timespan, *, 

252 dataIds: Optional[Iterable[DataCoordinate]] = None) -> None: 

253 # Docstring inherited from DatasetRecordStorage. 

254 if self._calibs is None: 254 ↛ 255 (line 254 didn't jump to line 255, because the condition on line 254 was never true)

255 raise TypeError(f"Cannot decertify datasets of type {self.datasetType.name}, for which " 

256 f"DatasetType.isCalibration() is False.") 

257 if collection.type is not CollectionType.CALIBRATION: 257 ↛ 258 (line 257 didn't jump to line 258, because the condition on line 257 was never true)

258 raise TypeError(f"Cannot decertify from collection '{collection.name}' " 

259 f"of type {collection.type.name}; must be CALIBRATION.") 

260 TimespanReprClass = self._db.getTimespanRepresentation() 

261 # Construct a SELECT query to find all rows that overlap our inputs. 

262 dataIdSet: Optional[DataCoordinateSet] 

263 if dataIds is not None: 

264 dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions) 

265 else: 

266 dataIdSet = None 

267 query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan) 

268 query.columns.extend(self._calibs.columns) 

269 sql = query.combine() 

270 # Set up collections to populate with the rows we'll want to modify. 

271 # The insert rows will have the same values for collection and 

272 # dataset type. 

273 protoInsertRow = { 

274 self._collections.getCollectionForeignKeyName(): collection.key, 

275 "dataset_type_id": self._dataset_type_id, 

276 } 

277 rowsToDelete = [] 

278 rowsToInsert = [] 

279 # Acquire a table lock to ensure there are no concurrent writes 

280 # between the SELECT and the DELETE and INSERT queries based on it. 

281 with self._db.transaction(lock=[self._calibs], savepoint=True): 

282 for row in self._db.query(sql).mappings(): 

283 rowsToDelete.append({"id": row["id"]}) 

284 # Construct the insert row(s) by copying the prototype row, 

285 # then adding the dimension column values, then adding what's 

286 # left of the timespan from that row after we subtract the 

287 # given timespan. 
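# For example (hypothetical values): decertifying [t2, t3) from a row
# whose validity range is [t1, t4) deletes that row and re-inserts two
# rows covering [t1, t2) and [t3, t4); the difference() call below can
# yield zero, one, or two such pieces depending on the overlap.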

288 newInsertRow = protoInsertRow.copy() 

289 newInsertRow["dataset_id"] = row["dataset_id"] 

290 for name in self.datasetType.dimensions.required.names: 

291 newInsertRow[name] = row[name] 

292 rowTimespan = TimespanReprClass.extract(row) 

293 assert rowTimespan is not None, "Field should have a NOT NULL constraint." 

294 for diffTimespan in rowTimespan.difference(timespan): 

295 rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy())) 

296 # Run the DELETE and INSERT queries. 

297 self._db.delete(self._calibs, ["id"], *rowsToDelete) 

298 self._db.insert(self._calibs, *rowsToInsert) 

299 

300 def select(self, *collections: CollectionRecord, 

301 dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select, 

302 id: SimpleQuery.Select.Or[Optional[int]] = SimpleQuery.Select, 

303 run: SimpleQuery.Select.Or[None] = SimpleQuery.Select, 

304 timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select, 

305 ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None, 

306 ) -> SimpleQuery: 

307 # Docstring inherited from DatasetRecordStorage. 

308 collection_types = {collection.type for collection in collections} 

309 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened." 

310 # 

311 # There are two tables in play here: 

312 # 

313 # - the static dataset table (with the dataset ID, dataset type ID, 

314 # run ID/name, and ingest date); 

315 # 

316 # - the dynamic tags/calibs table (with the dataset ID, dataset type 

317 # ID, collection ID/name, data ID, and possibly validity 

318 # range). 

319 # 

320 # That means that we might want to return a query against either table 

321 # or a JOIN of both, depending on which quantities the caller wants. 

322 # But this method is documented/typed such that ``dataId`` is never 

323 # `None` - i.e. we always constrain or retrieve the data ID. That 

324 # means we'll always include the tags/calibs table and join in the 

325 # static dataset table only if we need things from it that we can't get 

326 # from the tags/calibs table. 

327 # 

328 # Note that it's important that we include a WHERE constraint on both 

329 # tables for any column (e.g. dataset_type_id) that is in both when 

330 # it's given explicitly; not doing so can prevent the query planner from 

331 # using very important indexes. At present, we don't include those 

332 # redundant columns in the JOIN ON expression, however, because the 

333 # FOREIGN KEY (and its index) are defined only on dataset_id. 

334 # 

335 # We'll start with an empty SimpleQuery, and accumulate kwargs to pass 

336 # to its `join` method when we bring in the tags/calibs table. 
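# As a rough sketch (table and column names depend on the schema), a
# search of a single TAGGED collection with ``id=SimpleQuery.Select``
# and ``run=SimpleQuery.Select`` ends up shaped like:
#     SELECT tags.<data ID columns>, tags.dataset_id AS id, dataset.<run key>
#     FROM tags JOIN dataset ON tags.dataset_id = dataset.id
#     WHERE tags.dataset_type_id = :dtid
#       AND tags.<collection key> = :ckey
#       AND dataset.dataset_type_id = :dtid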

337 query = SimpleQuery() 

338 # We get the data ID or constrain it in the tags/calibs table, but 

339 # that's multiple columns, not one, so we need to transform the one 

340 # Select.Or argument into a dictionary of them. 

341 kwargs: Dict[str, Any] 

342 if dataId is SimpleQuery.Select: 

343 kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required} 

344 else: 

345 kwargs = dict(dataId.byName()) 

346 # We always constrain (never retrieve) the dataset type in at least the 

347 # tags/calibs table. 

348 kwargs["dataset_type_id"] = self._dataset_type_id 

349 # Join in the tags or calibs table, turning those 'kwargs' entries into 

350 # WHERE constraints or SELECT columns as appropriate. 

351 if collection_types == {CollectionType.CALIBRATION}: 

352 assert self._calibs is not None, \ 

353 "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection." 

354 TimespanReprClass = self._db.getTimespanRepresentation() 

355 # Add the timespan column(s) to the result columns, or constrain 

356 # the timespan via an overlap condition. 

357 if timespan is SimpleQuery.Select: 

358 kwargs.update({k: SimpleQuery.Select for k in TimespanReprClass.getFieldNames()}) 

359 elif timespan is not None: 359 ↛ 365 (line 359 didn't jump to line 365, because the condition on line 359 was never false)

360 query.where.append( 

361 TimespanReprClass.fromSelectable(self._calibs).overlaps( 

362 TimespanReprClass.fromLiteral(timespan) 

363 ) 

364 ) 

365 query.join(self._calibs, **kwargs) 

366 dataset_id_col = self._calibs.columns.dataset_id 

367 collection_col = self._calibs.columns[self._collections.getCollectionForeignKeyName()] 

368 elif CollectionType.CALIBRATION not in collection_types: 368 ↛ 373 (line 368 didn't jump to line 373, because the condition on line 368 was never false)

369 query.join(self._tags, **kwargs) 

370 dataset_id_col = self._tags.columns.dataset_id 

371 collection_col = self._tags.columns[self._collections.getCollectionForeignKeyName()] 

372 else: 

373 raise TypeError( 

374 "Cannot query for CALIBRATION collections in the same " 

375 "subquery as other kinds of collections." 

376 ) 

377 # We always constrain (never retrieve) the collection(s) in the 

378 # tags/calibs table. 

379 if len(collections) == 1: 

380 query.where.append(collection_col == collections[0].key) 

381 elif len(collections) == 0: 

382 # We support the case where there are no collections as a way to 

383 # generate a valid SQL query that can't yield results. This should 

384 # never get executed, but lots of downstream code will still try 

385 # to access the SQLAlchemy objects representing the columns in the 

386 # subquery. That's not ideal, but it'd take a lot of refactoring to 

387 # fix it. 

388 query.where.append(sqlalchemy.sql.literal(False)) 

389 else: 

390 query.where.append(collection_col.in_([collection.key for collection in collections])) 

391 # We can always get the dataset_id from the tags/calibs table or 

392 # constrain it there. Can't use kwargs for that because we need to 

393 # alias it to 'id'. 

394 if id is SimpleQuery.Select: 

395 query.columns.append(dataset_id_col.label("id")) 

396 elif id is not None: 396 ↛ 397 (line 396 didn't jump to line 397, because the condition on line 396 was never true)

397 query.where.append(dataset_id_col == id) 

398 # It's possible we now have everything we need, from just the 

399 # tags/calibs table. The things we might need to get from the static 

400 # dataset table are the run key and the ingest date. 

401 need_static_table = False 

402 static_kwargs: Dict[str, Any] = {} 

403 if run is not None: 

404 assert run is SimpleQuery.Select, "To constrain the run name, pass a RunRecord as a collection." 

405 if len(collections) == 1 and collections[0].type is CollectionType.RUN: 

406 # If we are searching exactly one RUN collection, we 

407 # know that if we find the dataset in that collection, 

408 # then that's the dataset's run; we don't need to 

409 # query for it. 

410 query.columns.append(sqlalchemy.sql.literal(collections[0].key).label(self._runKeyColumn)) 

411 else: 

412 static_kwargs[self._runKeyColumn] = SimpleQuery.Select 

413 need_static_table = True 

414 # Ingest date can only come from the static table. 

415 if ingestDate is not None: 

416 need_static_table = True 

417 if ingestDate is SimpleQuery.Select: 417 ↛ 420 (line 417 didn't jump to line 420, because the condition on line 417 was never false)

418 static_kwargs["ingest_date"] = SimpleQuery.Select 

419 else: 

420 assert isinstance(ingestDate, Timespan) 

421 # Timespan is astropy Time (usually in TAI) and ingest_date is 

422 # TIMESTAMP; convert values to Python datetime for SQLAlchemy. 

423 if ingestDate.isEmpty(): 

424 raise RuntimeError("Empty timespan constraint provided for ingest_date.") 

425 if ingestDate.begin is not None: 

426 begin = ingestDate.begin.utc.datetime # type: ignore 

427 query.where.append(self._static.dataset.columns.ingest_date >= begin) 

428 if ingestDate.end is not None: 

429 end = ingestDate.end.utc.datetime # type: ignore 

430 query.where.append(self._static.dataset.columns.ingest_date < end) 

431 # If we need the static table, join it in via dataset_id and 

432 # dataset_type_id 

433 if need_static_table: 

434 query.join( 

435 self._static.dataset, 

436 onclause=(dataset_id_col == self._static.dataset.columns.id), 

437 **static_kwargs, 

438 ) 

439 # Also constrain dataset_type_id in static table in case that helps 

440 # generate a better plan. 

441 # We could also include this in the JOIN ON clause, but my guess is 

442 # that that's a good idea IFF it's in the foreign key, and right 

443 # now it isn't. 

444 query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id) 

445 return query 

446 

447 def getDataId(self, id: DatasetId) -> DataCoordinate: 

448 """Return DataId for a dataset. 

449 

450 Parameters 

451 ---------- 

452 id : `DatasetId` 

453 Unique dataset identifier. 

454 

455 Returns 

456 ------- 

457 dataId : `DataCoordinate` 

458 DataId for the dataset. 

459 """ 

460 # This query could return multiple rows (one for each tagged collection 

461 # the dataset is in, plus one for its run collection), and we don't 

462 # care which of those we get. 

463 sql = self._tags.select().where( 

464 sqlalchemy.sql.and_( 

465 self._tags.columns.dataset_id == id, 

466 self._tags.columns.dataset_type_id == self._dataset_type_id 

467 ) 

468 ).limit(1) 

469 row = self._db.query(sql).mappings().fetchone() 

470 assert row is not None, "Should be guaranteed by caller and foreign key constraints." 

471 return DataCoordinate.standardize( 

472 {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required}, 

473 graph=self.datasetType.dimensions 

474 ) 

475 

476 

477class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage): 

478 """Implementation of ByDimensionsDatasetRecordStorage which uses integer 

479 auto-incremented column for dataset IDs. 

480 """ 

481 

482 def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate], 

483 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]: 

484 # Docstring inherited from DatasetRecordStorage. 

485 

486 # We only support UNIQUE mode for integer dataset IDs 

487 if idMode != DatasetIdGenEnum.UNIQUE: 487 ↛ 488 (line 487 didn't jump to line 488, because the condition on line 487 was never true)

488 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.") 

489 

490 # Transform a possibly-single-pass iterable into a list. 

491 dataIdList = list(dataIds) 

492 yield from self._insert(run, dataIdList) 

493 

494 def import_(self, run: RunRecord, datasets: Iterable[DatasetRef], 

495 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

496 reuseIds: bool = False) -> Iterator[DatasetRef]: 

497 # Docstring inherited from DatasetRecordStorage. 

498 

499 # We only support UNIQUE mode for integer dataset IDs 

500 if idGenerationMode != DatasetIdGenEnum.UNIQUE: 500 ↛ 501 (line 500 didn't jump to line 501, because the condition on line 500 was never true)

501 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.") 

502 

503 # Make a list of dataIds and optionally dataset IDs. 

504 dataIdList: List[DataCoordinate] = [] 

505 datasetIdList: List[int] = [] 

506 for dataset in datasets: 

507 dataIdList.append(dataset.dataId) 

508 

509 # We only accept integer dataset IDs, but also allow None. 

510 datasetId = dataset.id 

511 if datasetId is None: 511 ↛ 513 (line 511 didn't jump to line 513, because the condition on line 511 was never true)

512 # if reuseIds is set then all IDs must be known 

513 if reuseIds: 

514 raise TypeError("All dataset IDs must be known if `reuseIds` is set") 

515 elif isinstance(datasetId, int): 515 ↛ 519 (line 515 didn't jump to line 519, because the condition on line 515 was never false)

516 if reuseIds: 

517 datasetIdList.append(datasetId) 

518 else: 

519 raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}") 

520 

521 yield from self._insert(run, dataIdList, datasetIdList) 

522 

523 def _insert(self, run: RunRecord, dataIdList: List[DataCoordinate], 

524 datasetIdList: Optional[List[int]] = None) -> Iterator[DatasetRef]: 

525 """Common part of implementation of `insert` and `import_` methods. 

526 """ 

527 

528 # Remember any governor dimension values we see. 

529 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

530 for dataId in dataIdList: 

531 governorValues.update_extract(dataId) 

532 

533 staticRow = { 

534 "dataset_type_id": self._dataset_type_id, 

535 self._runKeyColumn: run.key, 

536 } 

537 with self._db.transaction(): 

538 # Insert into the static dataset table, generating autoincrement 

539 # dataset_id values. 

540 if datasetIdList: 

541 # reuse existing IDs 

542 rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList] 

543 self._db.insert(self._static.dataset, *rows) 

544 else: 

545 # use auto-incremented IDs 

546 datasetIdList = self._db.insert(self._static.dataset, *([staticRow]*len(dataIdList)), 

547 returnIds=True) 

548 assert datasetIdList is not None 

549 # Update the summary tables for this collection in case this is the 

550 # first time this dataset type or these governor values will be 

551 # inserted there. 

552 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues) 

553 # Combine the generated dataset_id values and data ID fields to 

554 # form rows to be inserted into the tags table. 

555 protoTagsRow = { 

556 "dataset_type_id": self._dataset_type_id, 

557 self._collections.getCollectionForeignKeyName(): run.key, 

558 } 

559 tagsRows = [ 

560 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

561 for dataId, dataset_id in zip(dataIdList, datasetIdList) 

562 ] 

563 # Insert those rows into the tags table. This is where we'll 

564 # get any unique constraint violations. 

565 self._db.insert(self._tags, *tagsRows) 

566 

567 for dataId, datasetId in zip(dataIdList, datasetIdList): 

568 yield DatasetRef( 

569 datasetType=self.datasetType, 

570 dataId=dataId, 

571 id=datasetId, 

572 run=run.name, 

573 ) 

574 

575 

576class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage): 

577 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for 

578 dataset IDs. 

579 """ 

580 

581 NS_UUID = uuid.UUID('840b31d9-05cd-5161-b2c8-00d32b280d0f') 

582 """Namespace UUID used for UUID5 generation. Do not change. This was 

583 produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`. 

584 """ 

585 

586 def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate], 

587 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]: 

588 # Docstring inherited from DatasetRecordStorage. 

589 

590 # Iterate over data IDs, transforming a possibly-single-pass iterable 

591 # into a list. 

592 dataIdList = [] 

593 rows = [] 

594 for dataId in dataIds: 

595 dataIdList.append(dataId) 

596 rows.append({ 

597 "id": self._makeDatasetId(run, dataId, idMode), 

598 "dataset_type_id": self._dataset_type_id, 

599 self._runKeyColumn: run.key, 

600 }) 

601 

602 yield from self._insert(run, dataIdList, rows, self._db.insert) 

603 

604 def import_(self, run: RunRecord, datasets: Iterable[DatasetRef], 

605 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

606 reuseIds: bool = False) -> Iterator[DatasetRef]: 

607 # Docstring inherited from DatasetRecordStorage. 

608 

609 # Iterate over data IDs, transforming a possibly-single-pass iterable 

610 # into a list. 

611 dataIdList = [] 

612 rows = [] 

613 for dataset in datasets: 

614 dataIdList.append(dataset.dataId) 

615 # Ignore unknown ID types; normally all IDs have the same type, but 

616 # this code supports mixed types or missing IDs. 

617 datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None 

618 if datasetId is None: 

619 datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode) 

620 rows.append({ 

621 "id": datasetId, 

622 "dataset_type_id": self._dataset_type_id, 

623 self._runKeyColumn: run.key, 

624 }) 

625 

626 yield from self._insert(run, dataIdList, rows, self._db.ensure) 

627 

628 def _insert(self, run: RunRecord, dataIdList: List[DataCoordinate], 

629 rows: List[Dict], insertMethod: Callable) -> Iterator[DatasetRef]: 

630 """Common part of implementation of `insert` and `import_` methods. 

631 """ 

632 

633 # Remember any governor dimension values we see. 

634 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

635 for dataId in dataIdList: 

636 governorValues.update_extract(dataId) 

637 

638 with self._db.transaction(): 

639 # Insert into the static dataset table. 

640 insertMethod(self._static.dataset, *rows) 

641 # Update the summary tables for this collection in case this is the 

642 # first time this dataset type or these governor values will be 

643 # inserted there. 

644 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues) 

645 # Combine the generated dataset_id values and data ID fields to 

646 # form rows to be inserted into the tags table. 

647 protoTagsRow = { 

648 "dataset_type_id": self._dataset_type_id, 

649 self._collections.getCollectionForeignKeyName(): run.key, 

650 } 

651 tagsRows = [ 

652 dict(protoTagsRow, dataset_id=row["id"], **dataId.byName()) 

653 for dataId, row in zip(dataIdList, rows) 

654 ] 

655 # Insert those rows into the tags table. 

656 insertMethod(self._tags, *tagsRows) 

657 for dataId, row in zip(dataIdList, rows): 

658 yield DatasetRef( 

659 datasetType=self.datasetType, 

660 dataId=dataId, 

661 id=row["id"], 

662 run=run.name, 

663 ) 

664 

665 def _makeDatasetId(self, run: RunRecord, dataId: DataCoordinate, 

666 idGenerationMode: DatasetIdGenEnum) -> uuid.UUID: 

667 """Generate dataset ID for a dataset. 

668 

669 Parameters 

670 ---------- 

671 run : `RunRecord` 

672 The record object describing the RUN collection for the dataset. 

673 dataId : `DataCoordinate` 

674 Expanded data ID for the dataset. 

675 idGenerationMode : `DatasetIdGenEnum` 

676 ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random 

677 UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a 

678 deterministic UUID5-type ID based on a dataset type name and 

679 ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a 

680 deterministic UUID5-type ID based on a dataset type name, run 

681 collection name, and ``dataId``. 

682 

683 Returns 

684 ------- 

685 datasetId : `uuid.UUID` 

686 Dataset identifier. 

687 """ 

688 if idGenerationMode is DatasetIdGenEnum.UNIQUE: 

689 return uuid.uuid4() 

690 else: 

691 # WARNING: If you modify this code make sure that the order of 

692 # items in the `items` list below never changes. 

693 items: List[Tuple[str, str]] = [] 

694 if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE: 694 ↛ 698 (line 694 didn't jump to line 698, because the condition on line 694 was never false)

695 items = [ 

696 ("dataset_type", self.datasetType.name), 

697 ] 

698 elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN: 

699 items = [ 

700 ("dataset_type", self.datasetType.name), 

701 ("run", run.name), 

702 ] 

703 else: 

704 raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}") 

705 

706 for name, value in sorted(dataId.byName().items()): 

707 items.append((name, str(value))) 

708 data = ",".join(f"{key}={value}" for key, value in items) 

709 return uuid.uuid5(self.NS_UUID, data)