1from __future__ import annotations 

2 

3__all__ = ("ByDimensionsDatasetRecordStorage",) 

4 

5from typing import ( 

6 Any, 

7 Dict, 

8 Iterable, 

9 Iterator, 

10 List, 

11 Optional, 

12 Set, 

13 Tuple, 

14 TYPE_CHECKING, 

15) 

16import uuid 

17 

18import sqlalchemy 

19 

20from lsst.daf.butler import ( 

21 CollectionType, 

22 DataCoordinate, 

23 DataCoordinateSet, 

24 DatasetId, 

25 DatasetRef, 

26 DatasetType, 

27 SimpleQuery, 

28 Timespan, 

29 ddl 

30) 

31from lsst.daf.butler.registry import ConflictingDefinitionError, UnsupportedIdGeneratorError 

32from lsst.daf.butler.registry.interfaces import DatasetRecordStorage, DatasetIdGenEnum 

33 

34from ...summaries import GovernorDimensionRestriction 

35from .tables import makeTagTableSpec 

36 

37if TYPE_CHECKING:

38 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord 

39 from .tables import StaticDatasetTablesTuple 

40 from .summaries import CollectionSummaryManager 

41 

42 

43class ByDimensionsDatasetRecordStorage(DatasetRecordStorage): 

44 """Dataset record storage implementation paired with 

45 `ByDimensionsDatasetRecordStorageManager`; see that class for more 

46 information. 

47 

48 Instances of this class should never be constructed directly; use 

49 `DatasetRecordStorageManager.register` instead. 

50 """ 

51 

52 def __init__(self, *, datasetType: DatasetType, 

53 db: Database, 

54 dataset_type_id: int, 

55 collections: CollectionManager, 

56 static: StaticDatasetTablesTuple, 

57 summaries: CollectionSummaryManager, 

58 tags: sqlalchemy.schema.Table, 

59 calibs: Optional[sqlalchemy.schema.Table]): 

60 super().__init__(datasetType=datasetType) 

61 self._dataset_type_id = dataset_type_id 

62 self._db = db 

63 self._collections = collections 

64 self._static = static 

65 self._summaries = summaries 

66 self._tags = tags 

67 self._calibs = calibs 

68 self._runKeyColumn = collections.getRunForeignKeyName() 

69 

70 def find(self, collection: CollectionRecord, dataId: DataCoordinate, 

71 timespan: Optional[Timespan] = None) -> Optional[DatasetRef]: 

72 # Docstring inherited from DatasetRecordStorage. 

73 assert dataId.graph == self.datasetType.dimensions 

74 if collection.type is CollectionType.CALIBRATION and timespan is None:

75 raise TypeError(f"Cannot search for dataset in CALIBRATION collection {collection.name} " 

76 f"without an input timespan.") 

77 sql = self.select(collection, dataId=dataId, id=SimpleQuery.Select, 

78 run=SimpleQuery.Select, timespan=timespan) 

79 sql = sql.combine() 

80 results = self._db.query(sql) 

81 row = results.fetchone() 

82 if row is None: 

83 return None 

84 if collection.type is CollectionType.CALIBRATION: 

85 # For temporal calibration lookups (only!) our invariants do not 

86 # guarantee that the number of result rows is <= 1. 

87 # They would if `select` constrained the given timespan to be 

88 # _contained_ by the validity range in the self._calibs table, 

89 # instead of simply _overlapping_ it, because we do guarantee that 

90 # the validity ranges are disjoint for a particular dataset type, 

91 # collection, and data ID. But using an overlap test and a check 

92 # for multiple result rows here allows us to provide a more useful 

93 # diagnostic, as well as allowing `select` to support more general 

94 # queries where multiple results are not an error. 

95 if results.fetchone() is not None: 

96 raise RuntimeError( 

97 f"Multiple matches found for calibration lookup in {collection.name} for " 

98 f"{self.datasetType.name} with {dataId} overlapping {timespan}. " 

99 ) 

100 return DatasetRef( 

101 datasetType=self.datasetType, 

102 dataId=dataId, 

103 id=row.id, 

104 run=self._collections[row._mapping[self._runKeyColumn]].name 

105 ) 

106 

107 def delete(self, datasets: Iterable[DatasetRef]) -> None: 

108 # Docstring inherited from DatasetRecordStorage. 

109 # Only delete from common dataset table; ON DELETE foreign key clauses 

110 # will handle the rest. 

111 self._db.delete( 

112 self._static.dataset, 

113 ["id"], 

114 *[{"id": dataset.getCheckedId()} for dataset in datasets], 

115 ) 

116 

117 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

118 # Docstring inherited from DatasetRecordStorage. 

119 if collection.type is not CollectionType.TAGGED:

120 raise TypeError(f"Cannot associate into collection '{collection.name}' " 

121 f"of type {collection.type.name}; must be TAGGED.") 

122 protoRow = { 

123 self._collections.getCollectionForeignKeyName(): collection.key, 

124 "dataset_type_id": self._dataset_type_id, 

125 } 

126 rows = [] 

127 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

128 for dataset in datasets: 

129 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

130 for dimension, value in dataset.dataId.items(): 

131 row[dimension.name] = value 

132 governorValues.update_extract(dataset.dataId) 

133 rows.append(row) 

134 # Update the summary tables for this collection in case this is the 

135 # first time this dataset type or these governor values will be 

136 # inserted there. 

137 self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues) 

138 # Update the tag table itself. 

139 self._db.replace(self._tags, *rows) 

140 

141 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

142 # Docstring inherited from DatasetRecordStorage. 

143 if collection.type is not CollectionType.TAGGED:

144 raise TypeError(f"Cannot disassociate from collection '{collection.name}' " 

145 f"of type {collection.type.name}; must be TAGGED.") 

146 rows = [ 

147 { 

148 "dataset_id": dataset.getCheckedId(), 

149 self._collections.getCollectionForeignKeyName(): collection.key 

150 } 

151 for dataset in datasets 

152 ] 

153 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], 

154 *rows) 

155 

156 def _buildCalibOverlapQuery(self, collection: CollectionRecord, 

157 dataIds: Optional[DataCoordinateSet], 

158 timespan: Timespan) -> SimpleQuery: 

159 assert self._calibs is not None 

160 # Start by building a SELECT query for any rows that would overlap 

161 # this one. 

162 query = SimpleQuery() 

163 query.join(self._calibs) 

164 # Add a WHERE clause matching the dataset type and collection. 

165 query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id) 

166 query.where.append( 

167 self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key 

168 ) 

169 # Add a WHERE clause matching any of the given data IDs. 

170 if dataIds is not None: 

171 dataIds.constrain( 

172 query, 

173 lambda name: self._calibs.columns[name], # type: ignore 

174 ) 

175 # Add WHERE clause for timespan overlaps. 

176 TimespanReprClass = self._db.getTimespanRepresentation() 

177 query.where.append( 

178 TimespanReprClass.fromSelectable(self._calibs).overlaps(TimespanReprClass.fromLiteral(timespan)) 

179 ) 

180 return query 

181 

182 def certify(self, collection: CollectionRecord, datasets: Iterable[DatasetRef], 

183 timespan: Timespan) -> None: 

184 # Docstring inherited from DatasetRecordStorage. 

185 if self._calibs is None:

186 raise TypeError(f"Cannot certify datasets of type {self.datasetType.name}, for which " 

187 f"DatasetType.isCalibration() is False.") 

188 if collection.type is not CollectionType.CALIBRATION:

189 raise TypeError(f"Cannot certify into collection '{collection.name}' " 

190 f"of type {collection.type.name}; must be CALIBRATION.") 

191 TimespanReprClass = self._db.getTimespanRepresentation() 

192 protoRow = { 

193 self._collections.getCollectionForeignKeyName(): collection.key, 

194 "dataset_type_id": self._dataset_type_id, 

195 } 

196 rows = [] 

197 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

198 dataIds: Optional[Set[DataCoordinate]] = ( 

199 set() if not TimespanReprClass.hasExclusionConstraint() else None 

200 ) 

201 for dataset in datasets: 

202 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

203 for dimension, value in dataset.dataId.items(): 

204 row[dimension.name] = value 

205 TimespanReprClass.update(timespan, result=row) 

206 governorValues.update_extract(dataset.dataId) 

207 rows.append(row) 

208 if dataIds is not None:

209 dataIds.add(dataset.dataId) 

210 # Update the summary tables for this collection in case this is the 

211 # first time this dataset type or these governor values will be 

212 # inserted there. 

213 self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues) 

214 # Update the association table itself. 

215 if TimespanReprClass.hasExclusionConstraint():

216 # Rely on database constraint to enforce invariants; we just 

217 # reraise the exception for consistency across DB engines. 

218 try: 

219 self._db.insert(self._calibs, *rows) 

220 except sqlalchemy.exc.IntegrityError as err: 

221 raise ConflictingDefinitionError( 

222 f"Validity range conflict certifying datasets of type {self.datasetType.name} " 

223 f"into {collection.name} for range [{timespan.begin}, {timespan.end})." 

224 ) from err 

225 else: 

226 # Have to implement exclusion constraint ourselves. 

227 # Start by building a SELECT query for any rows that would overlap 

228 # this one. 

229 query = self._buildCalibOverlapQuery( 

230 collection, 

231 DataCoordinateSet(dataIds, graph=self.datasetType.dimensions), # type: ignore 

232 timespan 

233 ) 

234 query.columns.append(sqlalchemy.sql.func.count()) 

235 sql = query.combine() 

236 # Acquire a table lock to ensure that no concurrent writes

237 # can invalidate our checks before we finish the inserts. We

238 # use a SAVEPOINT in case there is an outer transaction that a 

239 # failure here should not roll back. 

240 with self._db.transaction(lock=[self._calibs], savepoint=True): 

241 # Run the check SELECT query. 

242 conflicting = self._db.query(sql).scalar() 

243 if conflicting > 0: 

244 raise ConflictingDefinitionError( 

245 f"{conflicting} validity range conflicts certifying datasets of type " 

246 f"{self.datasetType.name} into {collection.name} for range " 

247 f"[{timespan.begin}, {timespan.end})." 

248 ) 

249 # Proceed with the insert. 

250 self._db.insert(self._calibs, *rows) 

251 
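Where the database cannot enforce a temporal exclusion constraint, the branch above emulates one: lock the calibs table, count existing rows whose validity range overlaps the new one, and refuse the insert if the count is non-zero. A minimal pure-Python sketch of the half-open overlap test that the SQL overlap clause expresses (plain integers stand in for the database timespan representation; the helper name is illustrative, not butler API):

def overlaps(begin_a: int, end_a: int, begin_b: int, end_b: int) -> bool:
    """Return True if half-open ranges [begin_a, end_a) and [begin_b, end_b) intersect."""
    return begin_a < end_b and begin_b < end_a


# A partially overlapping range conflicts; ranges that merely touch do not.
assert overlaps(0, 10, 5, 15)
assert not overlaps(0, 10, 10, 20)

If any existing row for the same collection, dataset type, and data ID passes this test against the new validity range, certify() raises ConflictingDefinitionError instead of inserting.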

252 def decertify(self, collection: CollectionRecord, timespan: Timespan, *, 

253 dataIds: Optional[Iterable[DataCoordinate]] = None) -> None: 

254 # Docstring inherited from DatasetRecordStorage. 

255 if self._calibs is None:

256 raise TypeError(f"Cannot decertify datasets of type {self.datasetType.name}, for which " 

257 f"DatasetType.isCalibration() is False.") 

258 if collection.type is not CollectionType.CALIBRATION:

259 raise TypeError(f"Cannot decertify from collection '{collection.name}' " 

260 f"of type {collection.type.name}; must be CALIBRATION.") 

261 TimespanReprClass = self._db.getTimespanRepresentation() 

262 # Construct a SELECT query to find all rows that overlap our inputs. 

263 dataIdSet: Optional[DataCoordinateSet] 

264 if dataIds is not None: 

265 dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions) 

266 else: 

267 dataIdSet = None 

268 query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan) 

269 query.columns.extend(self._calibs.columns) 

270 sql = query.combine() 

271 # Set up collections to populate with the rows we'll want to modify. 

272 # The insert rows will have the same values for collection and 

273 # dataset type. 

274 protoInsertRow = { 

275 self._collections.getCollectionForeignKeyName(): collection.key, 

276 "dataset_type_id": self._dataset_type_id, 

277 } 

278 rowsToDelete = [] 

279 rowsToInsert = [] 

280 # Acquire a table lock to ensure there are no concurrent writes 

281 # between the SELECT and the DELETE and INSERT queries based on it. 

282 with self._db.transaction(lock=[self._calibs], savepoint=True): 

283 for row in self._db.query(sql).mappings(): 

284 rowsToDelete.append({"id": row["id"]}) 

285 # Construct the insert row(s) by copying the prototype row, 

286 # then adding the dimension column values, then adding what's 

287 # left of the timespan from that row after we subtract the 

288 # given timespan. 

289 newInsertRow = protoInsertRow.copy() 

290 newInsertRow["dataset_id"] = row["dataset_id"] 

291 for name in self.datasetType.dimensions.required.names: 

292 newInsertRow[name] = row[name] 

293 rowTimespan = TimespanReprClass.extract(row) 

294 assert rowTimespan is not None, "Field should have a NOT NULL constraint." 

295 for diffTimespan in rowTimespan.difference(timespan): 

296 rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy())) 

297 # Run the DELETE and INSERT queries. 

298 self._db.delete(self._calibs, ["id"], *rowsToDelete) 

299 self._db.insert(self._calibs, *rowsToInsert) 

300 
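decertify() deletes every overlapping row and re-inserts whatever is left of each row's validity range after subtracting the decertified timespan, via Timespan.difference. A plain-integer analogue of that splitting, shown as a sketch with half-open ranges (not the butler Timespan class itself):

from typing import Iterator, Tuple


def difference(existing: Tuple[int, int], removed: Tuple[int, int]) -> Iterator[Tuple[int, int]]:
    """Yield the parts of half-open range ``existing`` not covered by ``removed``.

    Like ``Timespan.difference`` above, this can yield zero, one, or two pieces.
    """
    begin, end = existing
    r_begin, r_end = removed
    if begin < r_begin:
        yield (begin, min(end, r_begin))
    if r_end < end:
        yield (max(begin, r_end), end)


# Decertifying [5, 15) from a row certified over [0, 20) leaves two rows
# to re-insert, [0, 5) and [15, 20); decertifying a superset leaves none.
assert list(difference((0, 20), (5, 15))) == [(0, 5), (15, 20)]
assert list(difference((5, 15), (0, 20))) == []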

301 def select(self, *collections: CollectionRecord, 

302 dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select, 

303 id: SimpleQuery.Select.Or[Optional[int]] = SimpleQuery.Select, 

304 run: SimpleQuery.Select.Or[None] = SimpleQuery.Select, 

305 timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select, 

306 ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None, 

307 ) -> SimpleQuery: 

308 # Docstring inherited from DatasetRecordStorage. 

309 collection_types = {collection.type for collection in collections} 

310 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened." 

311 # 

312 # There are two tables in play here: 

313 # 

314 # - the static dataset table (with the dataset ID, dataset type ID, 

315 # run ID/name, and ingest date); 

316 # 

317 # - the dynamic tags/calibs table (with the dataset ID, dataset type 

318 # ID, collection ID/name, data ID, and possibly validity

319 # range). 

320 # 

321 # That means that we might want to return a query against either table 

322 # or a JOIN of both, depending on which quantities the caller wants. 

323 # But this method is documented/typed such that ``dataId`` is never 

324 # `None` - i.e. we always constrain or retrieve the data ID. That

325 # means we'll always include the tags/calibs table and join in the 

326 # static dataset table only if we need things from it that we can't get 

327 # from the tags/calibs table. 

328 # 

329 # Note that it's important that we include a WHERE constraint on both 

330 # tables for any column (e.g. dataset_type_id) that is in both when 

331 # it's given explicitly; not doing so can prevent the query planner from

332 # using very important indexes. At present, we don't include those 

333 # redundant columns in the JOIN ON expression, however, because the 

334 # FOREIGN KEY (and its index) are defined only on dataset_id. 

335 # 

336 # We'll start with an empty SimpleQuery, and accumulate kwargs to pass 

337 # to its `join` method when we bring in the tags/calibs table. 

338 query = SimpleQuery() 

339 # We get the data ID or constrain it in the tags/calibs table, but 

340 # that's multiple columns, not one, so we need to transform the one 

341 # Select.Or argument into a dictionary of them. 

342 kwargs: Dict[str, Any] 

343 if dataId is SimpleQuery.Select: 

344 kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required} 

345 else: 

346 kwargs = dict(dataId.byName()) 

347 # We always constrain (never retrieve) the dataset type in at least the 

348 # tags/calibs table. 

349 kwargs["dataset_type_id"] = self._dataset_type_id 

350 # Join in the tags or calibs table, turning those 'kwargs' entries into 

351 # WHERE constraints or SELECT columns as appropriate. 

352 if collection_types == {CollectionType.CALIBRATION}: 

353 assert self._calibs is not None, \ 

354 "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection." 

355 TimespanReprClass = self._db.getTimespanRepresentation() 

356 # Add the timespan column(s) to the result columns, or constrain 

357 # the timespan via an overlap condition. 

358 if timespan is SimpleQuery.Select: 

359 kwargs.update({k: SimpleQuery.Select for k in TimespanReprClass.getFieldNames()}) 

360 elif timespan is not None:

361 query.where.append( 

362 TimespanReprClass.fromSelectable(self._calibs).overlaps( 

363 TimespanReprClass.fromLiteral(timespan) 

364 ) 

365 ) 

366 query.join(self._calibs, **kwargs) 

367 dataset_id_col = self._calibs.columns.dataset_id 

368 collection_col = self._calibs.columns[self._collections.getCollectionForeignKeyName()] 

369 elif CollectionType.CALIBRATION not in collection_types:

370 query.join(self._tags, **kwargs) 

371 dataset_id_col = self._tags.columns.dataset_id 

372 collection_col = self._tags.columns[self._collections.getCollectionForeignKeyName()] 

373 else: 

374 raise TypeError( 

375 "Cannot query for CALIBRATION collections in the same " 

376 "subquery as other kinds of collections." 

377 ) 

378 # We always constrain (never retrieve) the collection(s) in the 

379 # tags/calibs table. 

380 if len(collections) == 1: 

381 query.where.append(collection_col == collections[0].key) 

382 elif len(collections) == 0: 

383 # We support the case where there are no collections as a way to 

384 # generate a valid SQL query that can't yield results. This should 

385 # never get executed, but lots of downstream code will still try 

386 # to access the SQLAlchemy objects representing the columns in the 

387 # subquery. That's not ideal, but it'd take a lot of refactoring to

388 # fix it. 

389 query.where.append(sqlalchemy.sql.literal(False)) 

390 else: 

391 query.where.append(collection_col.in_([collection.key for collection in collections])) 

392 # We can always get the dataset_id from the tags/calibs table or 

393 # constrain it there. Can't use kwargs for that because we need to 

394 # alias it to 'id'. 

395 if id is SimpleQuery.Select: 

396 query.columns.append(dataset_id_col.label("id")) 

397 elif id is not None:

398 query.where.append(dataset_id_col == id) 

399 # It's possible we now have everything we need, from just the 

400 # tags/calibs table. The things we might need to get from the static 

401 # dataset table are the run key and the ingest date. 

402 need_static_table = False 

403 static_kwargs: Dict[str, Any] = {} 

404 if run is not None: 

405 assert run is SimpleQuery.Select, "To constrain the run name, pass a RunRecord as a collection." 

406 if len(collections) == 1 and collections[0].type is CollectionType.RUN: 

407 # If we are searching exactly one RUN collection, we 

408 # know that if we find the dataset in that collection, 

409 # then that's the dataset's run; we don't need to

410 # query for it. 

411 query.columns.append(sqlalchemy.sql.literal(collections[0].key).label(self._runKeyColumn)) 

412 else: 

413 static_kwargs[self._runKeyColumn] = SimpleQuery.Select 

414 need_static_table = True 

415 # Ingest date can only come from the static table. 

416 if ingestDate is not None: 

417 need_static_table = True 

418 if ingestDate is SimpleQuery.Select:

419 static_kwargs["ingest_date"] = SimpleQuery.Select 

420 else: 

421 assert isinstance(ingestDate, Timespan) 

422 # Timespan bounds are astropy Time (usually TAI) but ingest_date is a

423 # TIMESTAMP column, so convert values to Python datetime for SQLAlchemy.

424 if ingestDate.isEmpty(): 

425 raise RuntimeError("Empty timespan constraint provided for ingest_date.") 

426 if ingestDate.begin is not None: 

427 begin = ingestDate.begin.utc.datetime # type: ignore 

428 query.where.append(self._static.dataset.columns.ingest_date >= begin) 

429 if ingestDate.end is not None: 

430 end = ingestDate.end.utc.datetime # type: ignore 

431 query.where.append(self._static.dataset.columns.ingest_date < end) 

432 # If we need the static table, join it in via dataset_id and 

433 # dataset_type_id 

434 if need_static_table: 

435 query.join( 

436 self._static.dataset, 

437 onclause=(dataset_id_col == self._static.dataset.columns.id), 

438 **static_kwargs, 

439 ) 

440 # Also constrain dataset_type_id in static table in case that helps 

441 # generate a better plan. 

442 # We could also include this in the JOIN ON clause, but my guess is 

443 # that that's a good idea IFF it's in the foreign key, and right 

444 # now it isn't. 

445 query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id) 

446 return query 

447 
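For a single TAGGED collection with the data ID, dataset ID, and run all requested, the query assembled above is, schematically, the tags table joined to the static dataset table, with the dataset type constrained redundantly on both sides. The identifiers below ("dataset_tags_00001", "collection_id", "run_id", "instrument", "detector") are illustrative placeholders following the pattern in this module, not the exact generated names:

# Rough shape of the generated SQL, for orientation only.
schematic_sql = """
SELECT
    tags.instrument, tags.detector,      -- data ID columns
    tags.dataset_id AS id,               -- dataset ID, aliased to 'id'
    dataset.run_id                       -- run key, from the static table
FROM dataset_tags_00001 AS tags
JOIN dataset ON tags.dataset_id = dataset.id
WHERE tags.dataset_type_id = :dataset_type_id
  AND tags.collection_id = :collection_key
  AND dataset.dataset_type_id = :dataset_type_id
"""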

448 def getDataId(self, id: DatasetId) -> DataCoordinate: 

449 """Return DataId for a dataset. 

450 

451 Parameters 

452 ---------- 

453 id : `DatasetId` 

454 Unique dataset identifier. 

455 

456 Returns 

457 ------- 

458 dataId : `DataCoordinate` 

459 DataId for the dataset. 

460 """ 

461 # This query could return multiple rows (one for each tagged collection 

462 # the dataset is in, plus one for its run collection), and we don't 

463 # care which of those we get. 

464 sql = self._tags.select().where( 

465 sqlalchemy.sql.and_( 

466 self._tags.columns.dataset_id == id, 

467 self._tags.columns.dataset_type_id == self._dataset_type_id 

468 ) 

469 ).limit(1) 

470 row = self._db.query(sql).mappings().fetchone() 

471 assert row is not None, "Should be guaranteed by caller and foreign key constraints." 

472 return DataCoordinate.standardize( 

473 {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required}, 

474 graph=self.datasetType.dimensions 

475 ) 

476 

477 

478class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage): 

479 """Implementation of ByDimensionsDatasetRecordStorage which uses integer 

480 auto-incremented column for dataset IDs. 

481 """ 

482 

483 def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate], 

484 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]: 

485 # Docstring inherited from DatasetRecordStorage. 

486 

487 # We only support UNIQUE mode for integer dataset IDs 

488 if idMode != DatasetIdGenEnum.UNIQUE:

489 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.") 

490 

491 # Transform a possibly-single-pass iterable into a list. 

492 dataIdList = list(dataIds) 

493 yield from self._insert(run, dataIdList) 

494 

495 def import_(self, run: RunRecord, datasets: Iterable[DatasetRef], 

496 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

497 reuseIds: bool = False) -> Iterator[DatasetRef]: 

498 # Docstring inherited from DatasetRecordStorage. 

499 

500 # We only support UNIQUE mode for integer dataset IDs 

501 if idGenerationMode != DatasetIdGenEnum.UNIQUE:

502 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.") 

503 

504 # Make a list of dataIds and optionally dataset IDs. 

505 dataIdList: List[DataCoordinate] = [] 

506 datasetIdList: List[int] = [] 

507 for dataset in datasets: 

508 dataIdList.append(dataset.dataId) 

509 

510 # We only accept integer dataset IDs, but also allow None. 

511 datasetId = dataset.id 

512 if datasetId is None:

513 # if reuseIds is set then all IDs must be known 

514 if reuseIds: 

515 raise TypeError("All dataset IDs must be known if `reuseIds` is set") 

516 elif isinstance(datasetId, int):

517 if reuseIds: 

518 datasetIdList.append(datasetId) 

519 else: 

520 raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}") 

521 

522 yield from self._insert(run, dataIdList, datasetIdList) 

523 

524 def _insert(self, run: RunRecord, dataIdList: List[DataCoordinate], 

525 datasetIdList: Optional[List[int]] = None) -> Iterator[DatasetRef]: 

526 """Common part of implementation of `insert` and `import_` methods. 

527 """ 

528 

529 # Remember any governor dimension values we see. 

530 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

531 for dataId in dataIdList: 

532 governorValues.update_extract(dataId) 

533 

534 staticRow = { 

535 "dataset_type_id": self._dataset_type_id, 

536 self._runKeyColumn: run.key, 

537 } 

538 with self._db.transaction(): 

539 # Insert into the static dataset table, generating autoincrement 

540 # dataset_id values. 

541 if datasetIdList: 

542 # reuse existing IDs 

543 rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList] 

544 self._db.insert(self._static.dataset, *rows) 

545 else: 

546 # use auto-incremented IDs 

547 datasetIdList = self._db.insert(self._static.dataset, *([staticRow]*len(dataIdList)), 

548 returnIds=True) 

549 assert datasetIdList is not None 

550 # Update the summary tables for this collection in case this is the 

551 # first time this dataset type or these governor values will be 

552 # inserted there. 

553 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues) 

554 # Combine the generated dataset_id values and data ID fields to 

555 # form rows to be inserted into the tags table. 

556 protoTagsRow = { 

557 "dataset_type_id": self._dataset_type_id, 

558 self._collections.getCollectionForeignKeyName(): run.key, 

559 } 

560 tagsRows = [ 

561 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

562 for dataId, dataset_id in zip(dataIdList, datasetIdList) 

563 ] 

564 # Insert those rows into the tags table. This is where we'll 

565 # get any unique constraint violations. 

566 self._db.insert(self._tags, *tagsRows) 

567 

568 for dataId, datasetId in zip(dataIdList, datasetIdList): 

569 yield DatasetRef( 

570 datasetType=self.datasetType, 

571 dataId=dataId, 

572 id=datasetId, 

573 run=run.name, 

574 ) 

575 

576 

577class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage): 

578 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for 

579 dataset IDs. 

580 """ 

581 

582 NS_UUID = uuid.UUID('840b31d9-05cd-5161-b2c8-00d32b280d0f') 

583 """Namespace UUID used for UUID5 generation. Do not change. This was 

584 produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`. 

585 """ 

586 
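The namespace constant above can be re-derived from the recipe given in its docstring; a quick, self-contained check:

import uuid

# Per the docstring above, this should print 840b31d9-05cd-5161-b2c8-00d32b280d0f.
print(uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org"))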

587 def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate], 

588 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]: 

589 # Docstring inherited from DatasetRecordStorage. 

590 

591 # Remember any governor dimension values we see. 

592 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

593 

594 # Iterate over data IDs, transforming a possibly-single-pass iterable 

595 # into a list. 

596 dataIdList = [] 

597 rows = [] 

598 for dataId in dataIds: 

599 dataIdList.append(dataId) 

600 rows.append({ 

601 "id": self._makeDatasetId(run, dataId, idMode), 

602 "dataset_type_id": self._dataset_type_id, 

603 self._runKeyColumn: run.key, 

604 }) 

605 governorValues.update_extract(dataId) 

606 

607 with self._db.transaction(): 

608 # Insert into the static dataset table. 

609 self._db.insert(self._static.dataset, *rows) 

610 # Update the summary tables for this collection in case this is the 

611 # first time this dataset type or these governor values will be 

612 # inserted there. 

613 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues) 

614 # Combine the generated dataset_id values and data ID fields to 

615 # form rows to be inserted into the tags table. 

616 protoTagsRow = { 

617 "dataset_type_id": self._dataset_type_id, 

618 self._collections.getCollectionForeignKeyName(): run.key, 

619 } 

620 tagsRows = [ 

621 dict(protoTagsRow, dataset_id=row["id"], **dataId.byName()) 

622 for dataId, row in zip(dataIdList, rows) 

623 ] 

624 # Insert those rows into the tags table. 

625 self._db.insert(self._tags, *tagsRows) 

626 

627 for dataId, row in zip(dataIdList, rows): 

628 yield DatasetRef( 

629 datasetType=self.datasetType, 

630 dataId=dataId, 

631 id=row["id"], 

632 run=run.name, 

633 ) 

634 

635 def import_(self, run: RunRecord, datasets: Iterable[DatasetRef], 

636 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

637 reuseIds: bool = False) -> Iterator[DatasetRef]: 

638 # Docstring inherited from DatasetRecordStorage. 

639 

640 # Remember any governor dimension values we see. 

641 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

642 

643 # Iterate over data IDs, transforming a possibly-single-pass iterable 

644 # into a list. 

645 dataIds = {} 

646 for dataset in datasets: 

647 # Ignore unknown ID types; normally all IDs have the same type, but

648 # this code supports mixed types or missing IDs.

649 datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None 

650 if datasetId is None: 

651 datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode) 

652 dataIds[datasetId] = dataset.dataId 

653 governorValues.update_extract(dataset.dataId) 

654 

655 with self._db.session() as session: 

656 

657 # insert all new rows into a temporary table 

658 tableSpec = makeTagTableSpec(self.datasetType, type(self._collections), 

659 ddl.GUID, constraints=False) 

660 tmp_tags = session.makeTemporaryTable(tableSpec) 

661 

662 collFkName = self._collections.getCollectionForeignKeyName() 

663 protoTagsRow = { 

664 "dataset_type_id": self._dataset_type_id, 

665 collFkName: run.key, 

666 } 

667 tmpRows = [dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

668 for dataset_id, dataId in dataIds.items()] 

669 

670 with self._db.transaction(): 

671 

672 # store all incoming data in a temporary table 

673 self._db.insert(tmp_tags, *tmpRows) 

674 

675 # There are some checks that we want to make for consistency 

676 # of the new datasets with existing ones. 

677 self._validateImport(tmp_tags, run) 

678 

679 # Before we merge the temporary table into dataset/tags we need to

680 # drop datasets which are already there (and do not conflict). 

681 self._db.deleteWhere(tmp_tags, tmp_tags.columns.dataset_id.in_( 

682 sqlalchemy.sql.select(self._static.dataset.columns.id) 

683 )) 

684 

685 # Copy it into the dataset table; we need to re-label some columns.

686 self._db.insert(self._static.dataset, select=sqlalchemy.sql.select( 

687 tmp_tags.columns.dataset_id.label("id"), 

688 tmp_tags.columns.dataset_type_id, 

689 tmp_tags.columns[collFkName].label(self._runKeyColumn) 

690 )) 

691 

692 # Update the summary tables for this collection in case this 

693 # is the first time this dataset type or these governor values 

694 # will be inserted there. 

695 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues) 

696 

697 # Copy it into tags table. 

698 self._db.insert(self._tags, select=tmp_tags.select()) 

699 

700 # Return refs in the same order as in the input list. 

701 for dataset_id, dataId in dataIds.items(): 

702 yield DatasetRef( 

703 datasetType=self.datasetType, 

704 id=dataset_id, 

705 dataId=dataId, 

706 run=run.name, 

707 ) 

708 

709 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None: 

710 """Validate imported refs against existing datasets. 

711 

712 Parameters 

713 ---------- 

714 tmp_tags : `sqlalchemy.schema.Table` 

715 Temporary table with new datasets and the same schema as tags 

716 table. 

717 run : `RunRecord` 

718 The record object describing the `~CollectionType.RUN` collection. 

719 

720 Raises 

721 ------ 

722 ConflictingDefinitionError 

723 Raised if new datasets conflict with existing ones.

724 """ 

725 dataset = self._static.dataset 

726 tags = self._tags 

727 collFkName = self._collections.getCollectionForeignKeyName() 

728 

729 # Check that existing datasets have the same dataset type and 

730 # run. 

731 query = sqlalchemy.sql.select( 

732 dataset.columns.id.label("dataset_id"), 

733 dataset.columns.dataset_type_id.label("dataset_type_id"), 

734 tmp_tags.columns.dataset_type_id.label("new dataset_type_id"), 

735 dataset.columns[self._runKeyColumn].label("run"), 

736 tmp_tags.columns[collFkName].label("new run") 

737 ).select_from( 

738 dataset.join( 

739 tmp_tags, 

740 dataset.columns.id == tmp_tags.columns.dataset_id 

741 ) 

742 ).where( 

743 sqlalchemy.sql.or_( 

744 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

745 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName] 

746 ) 

747 ) 

748 result = self._db.query(query) 

749 if (row := result.first()) is not None: 

750 # Only include the first one in the exception message 

751 raise ConflictingDefinitionError( 

752 f"Existing dataset type or run do not match new dataset: {row._asdict()}" 

753 ) 

754 

755 # Check that matching dataset in tags table has the same DataId. 

756 query = sqlalchemy.sql.select( 

757 tags.columns.dataset_id, 

758 tags.columns.dataset_type_id.label("type_id"), 

759 tmp_tags.columns.dataset_type_id.label("new type_id"), 

760 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

761 *[tmp_tags.columns[dim].label(f"new {dim}") 

762 for dim in self.datasetType.dimensions.required.names], 

763 ).select_from( 

764 tags.join( 

765 tmp_tags, 

766 tags.columns.dataset_id == tmp_tags.columns.dataset_id 

767 ) 

768 ).where( 

769 sqlalchemy.sql.or_( 

770 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

771 *[tags.columns[dim] != tmp_tags.columns[dim] 

772 for dim in self.datasetType.dimensions.required.names] 

773 ) 

774 ) 

775 result = self._db.query(query) 

776 if (row := result.first()) is not None: 

777 # Only include the first one in the exception message 

778 raise ConflictingDefinitionError( 

779 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}" 

780 ) 

781 

782 # Check that matching run+dataId have the same dataset ID. 

783 query = sqlalchemy.sql.select( 

784 tags.columns.dataset_type_id.label("dataset_type_id"), 

785 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

786 tags.columns.dataset_id, 

787 tmp_tags.columns.dataset_id.label("new dataset_id"), 

788 tags.columns[collFkName], 

789 tmp_tags.columns[collFkName].label(f"new {collFkName}") 

790 ).select_from( 

791 tags.join( 

792 tmp_tags, 

793 sqlalchemy.sql.and_( 

794 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id, 

795 tags.columns[collFkName] == tmp_tags.columns[collFkName], 

796 *[tags.columns[dim] == tmp_tags.columns[dim] 

797 for dim in self.datasetType.dimensions.required.names] 

798 ) 

799 ) 

800 ).where( 

801 tags.columns.dataset_id != tmp_tags.columns.dataset_id 

802 ) 

803 result = self._db.query(query) 

804 if (row := result.first()) is not None: 

805 # only include the first one in the exception message 

806 raise ConflictingDefinitionError( 

807 f"Existing dataset type and dataId does not match new dataset: {row._asdict()}" 

808 ) 

809 

810 def _makeDatasetId(self, run: RunRecord, dataId: DataCoordinate, 

811 idGenerationMode: DatasetIdGenEnum) -> uuid.UUID: 

812 """Generate dataset ID for a dataset. 

813 

814 Parameters 

815 ---------- 

816 run : `RunRecord` 

817 The record object describing the RUN collection for the dataset. 

818 dataId : `DataCoordinate` 

819 Expanded data ID for the dataset. 

820 idGenerationMode : `DatasetIdGenEnum` 

821 ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random

822 UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a 

823 deterministic UUID5-type ID based on a dataset type name and 

824 ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a 

825 deterministic UUID5-type ID based on a dataset type name, run 

826 collection name, and ``dataId``. 

827 

828 Returns 

829 ------- 

830 datasetId : `uuid.UUID` 

831 Dataset identifier. 

832 """ 

833 if idGenerationMode is DatasetIdGenEnum.UNIQUE: 

834 return uuid.uuid4() 

835 else: 

836 # WARNING: If you modify this code make sure that the order of 

837 # items in the `items` list below never changes. 

838 items: List[Tuple[str, str]] = [] 

839 if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE: 

840 items = [ 

841 ("dataset_type", self.datasetType.name), 

842 ] 

843 elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:

844 items = [ 

845 ("dataset_type", self.datasetType.name), 

846 ("run", run.name), 

847 ] 

848 else: 

849 raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}") 

850 

851 for name, value in sorted(dataId.byName().items()): 

852 items.append((name, str(value))) 

853 data = ",".join(f"{key}={value}" for key, value in items) 

854 return uuid.uuid5(self.NS_UUID, data)
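A self-contained sketch of the deterministic branch of _makeDatasetId above, for DATAID_TYPE_RUN mode. The dataset type name, run name, and data ID values are made-up inputs; the real method takes them from DatasetType, RunRecord, and DataCoordinate objects:

import uuid

# Namespace copied from ByDimensionsDatasetRecordStorageUUID.NS_UUID above.
NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")

# Hypothetical inputs for illustration.
dataset_type = "calexp"
run = "my_run"
data_id = {"instrument": "HSC", "visit": 903334, "detector": 42}

# DATAID_TYPE_RUN mode: dataset type first, then run, then the data ID
# entries sorted by key, serialized as 'key=value' pairs joined by commas.
items = [("dataset_type", dataset_type), ("run", run)]
items += [(key, str(value)) for key, value in sorted(data_id.items())]
data = ",".join(f"{key}={value}" for key, value in items)

# The same inputs always reproduce the same UUID5, which is what makes
# DATAID_TYPE_RUN imports idempotent across repositories.
print(uuid.uuid5(NS_UUID, data))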