Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 89%

317 statements  

coverage.py v7.5.0, created at 2024-04-24 23:49 -0700

1 from __future__ import annotations

2

3 __all__ = ("ByDimensionsDatasetRecordStorage",)

4

5 import uuid

6 from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Tuple

7

8 import sqlalchemy

9 from lsst.daf.butler import (

10 CollectionType,

11 DataCoordinate,

12 DataCoordinateSet,

13 DatasetId,

14 DatasetRef,

15 DatasetType,

16 SimpleQuery,

17 Timespan,

18 ddl,

19 )

20 from lsst.daf.butler.registry import (

21 CollectionTypeError,

22 ConflictingDefinitionError,

23 UnsupportedIdGeneratorError,

24 )

25 from lsst.daf.butler.registry.interfaces import DatasetIdGenEnum, DatasetRecordStorage

26

27 from ...summaries import GovernorDimensionRestriction

28 from .tables import makeTagTableSpec

29 

30 if TYPE_CHECKING:  (30 ↛ 31: line 30 didn't jump to line 31, because the condition on line 30 was never true)

31 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord 

32 from .summaries import CollectionSummaryManager 

33 from .tables import StaticDatasetTablesTuple 

34 

35 

36 class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):

37 """Dataset record storage implementation paired with 

38 `ByDimensionsDatasetRecordStorageManager`; see that class for more 

39 information. 

40 

41 Instances of this class should never be constructed directly; use 

42 `DatasetRecordStorageManager.register` instead. 

43 """ 

44 

45 def __init__( 

46 self, 

47 *, 

48 datasetType: DatasetType, 

49 db: Database, 

50 dataset_type_id: int, 

51 collections: CollectionManager, 

52 static: StaticDatasetTablesTuple, 

53 summaries: CollectionSummaryManager, 

54 tags: sqlalchemy.schema.Table, 

55 calibs: Optional[sqlalchemy.schema.Table], 

56 ): 

57 super().__init__(datasetType=datasetType) 

58 self._dataset_type_id = dataset_type_id 

59 self._db = db 

60 self._collections = collections 

61 self._static = static 

62 self._summaries = summaries 

63 self._tags = tags 

64 self._calibs = calibs 

65 self._runKeyColumn = collections.getRunForeignKeyName() 

66 

67 def find( 

68 self, collection: CollectionRecord, dataId: DataCoordinate, timespan: Optional[Timespan] = None 

69 ) -> Optional[DatasetRef]: 

70 # Docstring inherited from DatasetRecordStorage. 

71 assert dataId.graph == self.datasetType.dimensions 

72 if collection.type is CollectionType.CALIBRATION and timespan is None:  (72 ↛ 73: line 72 didn't jump to line 73, because the condition on line 72 was never true)

73 raise TypeError( 

74 f"Cannot search for dataset in CALIBRATION collection {collection.name} " 

75 f"without an input timespan." 

76 ) 

77 sql = self.select( 

78 collection, dataId=dataId, id=SimpleQuery.Select, run=SimpleQuery.Select, timespan=timespan 

79 ) 

80 with self._db.query(sql) as results: 

81 row = results.fetchone() 

82 if row is None: 

83 return None 

84 if collection.type is CollectionType.CALIBRATION: 

85 # For temporal calibration lookups (only!) our invariants do 

86 # not guarantee that the number of result rows is <= 1. They 

87 # would if `select` constrained the given timespan to be 

88 # _contained_ by the validity range in the self._calibs table, 

89 # instead of simply _overlapping_ it, because we do guarantee 

90 # that the validity ranges are disjoint for a particular 

91 # dataset type, collection, and data ID. But using an overlap 

92 # test and a check for multiple result rows here allows us to 

93 # provide a more useful diagnostic, as well as allowing 

94 # `select` to support more general queries where multiple 

95 # results are not an error. 

96 if results.fetchone() is not None: 

97 raise RuntimeError( 

98 f"Multiple matches found for calibration lookup in {collection.name} for " 

99 f"{self.datasetType.name} with {dataId} overlapping {timespan}. " 

100 ) 

101 return DatasetRef( 

102 datasetType=self.datasetType, 

103 dataId=dataId, 

104 id=row.id, 

105 run=self._collections[row._mapping[self._runKeyColumn]].name, 

106 ) 
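
The multiple-row check above guards a real corner case: certified validity ranges for a given dataset type, collection, and data ID are disjoint, yet an overlap test can still match more than one of them when the query timespan straddles a boundary. A toy illustration with half-open integer intervals standing in for Timespan (the interval values below are invented for illustration):

    def overlaps(a: tuple[int, int], b: tuple[int, int]) -> bool:
        # Half-open intervals [a0, a1) and [b0, b1) overlap iff each one
        # starts before the other ends.
        return a[0] < b[1] and b[0] < a[1]

    validity_ranges = [(0, 10), (10, 20)]  # disjoint ranges for one data ID
    query_window = (5, 15)                 # straddles the boundary at 10
    matches = [r for r in validity_ranges if overlaps(r, query_window)]
    print(matches)  # [(0, 10), (10, 20)] -> two rows, hence the explicit check
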

107 

108 def delete(self, datasets: Iterable[DatasetRef]) -> None: 

109 # Docstring inherited from DatasetRecordStorage. 

110 # Only delete from common dataset table; ON DELETE foreign key clauses 

111 # will handle the rest. 

112 self._db.delete( 

113 self._static.dataset, 

114 ["id"], 

115 *[{"id": dataset.getCheckedId()} for dataset in datasets], 

116 ) 

117 

118 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

119 # Docstring inherited from DatasetRecordStorage. 

120 if collection.type is not CollectionType.TAGGED:  (120 ↛ 121: line 120 didn't jump to line 121, because the condition on line 120 was never true)

121 raise TypeError( 

122 f"Cannot associate into collection '{collection.name}' " 

123 f"of type {collection.type.name}; must be TAGGED." 

124 ) 

125 protoRow = { 

126 self._collections.getCollectionForeignKeyName(): collection.key, 

127 "dataset_type_id": self._dataset_type_id, 

128 } 

129 rows = [] 

130 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

131 for dataset in datasets: 

132 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

133 for dimension, value in dataset.dataId.items(): 

134 row[dimension.name] = value 

135 governorValues.update_extract(dataset.dataId) 

136 rows.append(row) 

137 # Update the summary tables for this collection in case this is the 

138 # first time this dataset type or these governor values will be 

139 # inserted there. 

140 self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues) 

141 # Update the tag table itself. 

142 self._db.replace(self._tags, *rows) 

143 

144 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

145 # Docstring inherited from DatasetRecordStorage. 

146 if collection.type is not CollectionType.TAGGED:  (146 ↛ 147: line 146 didn't jump to line 147, because the condition on line 146 was never true)

147 raise TypeError( 

148 f"Cannot disassociate from collection '{collection.name}' " 

149 f"of type {collection.type.name}; must be TAGGED." 

150 ) 

151 rows = [ 

152 { 

153 "dataset_id": dataset.getCheckedId(), 

154 self._collections.getCollectionForeignKeyName(): collection.key, 

155 } 

156 for dataset in datasets 

157 ] 

158 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows) 

159 

160 def _buildCalibOverlapQuery( 

161 self, collection: CollectionRecord, dataIds: Optional[DataCoordinateSet], timespan: Timespan 

162 ) -> SimpleQuery: 

163 assert self._calibs is not None 

164 # Start by building a SELECT query for any rows that would overlap 

165 # this one. 

166 query = SimpleQuery() 

167 query.join(self._calibs) 

168 # Add a WHERE clause matching the dataset type and collection. 

169 query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id) 

170 query.where.append( 

171 self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key 

172 ) 

173 # Add a WHERE clause matching any of the given data IDs. 

174 if dataIds is not None: 

175 dataIds.constrain( 

176 query, 

177 lambda name: self._calibs.columns[name], # type: ignore 

178 ) 

179 # Add WHERE clause for timespan overlaps. 

180 TimespanReprClass = self._db.getTimespanRepresentation() 

181 query.where.append( 

182 TimespanReprClass.fromSelectable(self._calibs).overlaps(TimespanReprClass.fromLiteral(timespan)) 

183 ) 

184 return query 

185 

186 def certify( 

187 self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan 

188 ) -> None: 

189 # Docstring inherited from DatasetRecordStorage. 

190 if self._calibs is None:  (190 ↛ 191: line 190 didn't jump to line 191, because the condition on line 190 was never true)

191 raise CollectionTypeError( 

192 f"Cannot certify datasets of type {self.datasetType.name}, for which " 

193 f"DatasetType.isCalibration() is False." 

194 ) 

195 if collection.type is not CollectionType.CALIBRATION:  (195 ↛ 196: line 195 didn't jump to line 196, because the condition on line 195 was never true)

196 raise CollectionTypeError( 

197 f"Cannot certify into collection '{collection.name}' " 

198 f"of type {collection.type.name}; must be CALIBRATION." 

199 ) 

200 TimespanReprClass = self._db.getTimespanRepresentation() 

201 protoRow = { 

202 self._collections.getCollectionForeignKeyName(): collection.key, 

203 "dataset_type_id": self._dataset_type_id, 

204 } 

205 rows = [] 

206 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

207 dataIds: Optional[Set[DataCoordinate]] = ( 

208 set() if not TimespanReprClass.hasExclusionConstraint() else None 

209 ) 

210 for dataset in datasets: 

211 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

212 for dimension, value in dataset.dataId.items(): 

213 row[dimension.name] = value 

214 TimespanReprClass.update(timespan, result=row) 

215 governorValues.update_extract(dataset.dataId) 

216 rows.append(row) 

217 if dataIds is not None:  (217 ↛ 210: line 217 didn't jump to line 210, because the condition on line 217 was never false)

218 dataIds.add(dataset.dataId) 

219 # Update the summary tables for this collection in case this is the 

220 # first time this dataset type or these governor values will be 

221 # inserted there. 

222 self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues) 

223 # Update the association table itself. 

224 if TimespanReprClass.hasExclusionConstraint():  (224 ↛ 227: line 224 didn't jump to line 227, because the condition on line 224 was never true)

225 # Rely on database constraint to enforce invariants; we just 

226 # reraise the exception for consistency across DB engines. 

227 try: 

228 self._db.insert(self._calibs, *rows) 

229 except sqlalchemy.exc.IntegrityError as err: 

230 raise ConflictingDefinitionError( 

231 f"Validity range conflict certifying datasets of type {self.datasetType.name} " 

232 f"into {collection.name} for range [{timespan.begin}, {timespan.end})." 

233 ) from err 

234 else: 

235 # Have to implement exclusion constraint ourselves. 

236 # Start by building a SELECT query for any rows that would overlap 

237 # this one. 

238 query = self._buildCalibOverlapQuery( 

239 collection, 

240 DataCoordinateSet(dataIds, graph=self.datasetType.dimensions), # type: ignore 

241 timespan, 

242 ) 

243 query.columns.append(sqlalchemy.sql.func.count()) 

244 sql = query.combine() 

245 # Acquire a table lock to ensure there are no concurrent writes 

246 # that could invalidate our checking before we finish the inserts. We

247 # use a SAVEPOINT in case there is an outer transaction that a 

248 # failure here should not roll back. 

249 with self._db.transaction(lock=[self._calibs], savepoint=True): 

250 # Run the check SELECT query. 

251 with self._db.query(sql) as sql_result: 

252 conflicting = sql_result.scalar() 

253 if conflicting > 0: 

254 raise ConflictingDefinitionError( 

255 f"{conflicting} validity range conflicts certifying datasets of type " 

256 f"{self.datasetType.name} into {collection.name} for range " 

257 f"[{timespan.begin}, {timespan.end})." 

258 ) 

259 # Proceed with the insert. 

260 self._db.insert(self._calibs, *rows) 
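
For databases without a native exclusion constraint, the else-branch above emulates one: take a table lock, count overlapping rows, and only then insert. A minimal sketch of the check-then-insert part, assuming plain SQLAlchemy on SQLite with a simplified calibs table whose validity ranges are half-open integer intervals; the real code additionally locks the table through the butler Database API, and all table/column names here are illustrative, not the butler schema:

    import sqlalchemy

    # Toy table; names and columns are illustrative, not the butler schema.
    engine = sqlalchemy.create_engine("sqlite://")
    metadata = sqlalchemy.MetaData()
    calibs = sqlalchemy.Table(
        "calibs",
        metadata,
        sqlalchemy.Column("dataset_id", sqlalchemy.Integer),
        sqlalchemy.Column("t_begin", sqlalchemy.Integer),
        sqlalchemy.Column("t_end", sqlalchemy.Integer),
    )
    metadata.create_all(engine)

    def certify(conn: sqlalchemy.engine.Connection, dataset_id: int, begin: int, end: int) -> None:
        # Count existing rows whose [t_begin, t_end) overlaps [begin, end).
        overlapping = sqlalchemy.select(sqlalchemy.func.count()).where(
            calibs.c.t_begin < end, calibs.c.t_end > begin
        )
        if conn.execute(overlapping).scalar() > 0:
            raise ValueError("validity range conflict")
        conn.execute(calibs.insert().values(dataset_id=dataset_id, t_begin=begin, t_end=end))

    with engine.begin() as conn:  # one transaction around check + insert
        certify(conn, 1, 0, 10)
        certify(conn, 2, 10, 20)   # adjacent half-open ranges do not overlap
        # certify(conn, 3, 5, 15)  # would raise: overlaps both existing rows
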

261 

262 def decertify( 

263 self, 

264 collection: CollectionRecord, 

265 timespan: Timespan, 

266 *, 

267 dataIds: Optional[Iterable[DataCoordinate]] = None, 

268 ) -> None: 

269 # Docstring inherited from DatasetRecordStorage. 

270 if self._calibs is None:  (270 ↛ 271: line 270 didn't jump to line 271, because the condition on line 270 was never true)

271 raise CollectionTypeError( 

272 f"Cannot decertify datasets of type {self.datasetType.name}, for which " 

273 f"DatasetType.isCalibration() is False." 

274 ) 

275 if collection.type is not CollectionType.CALIBRATION:  (275 ↛ 276: line 275 didn't jump to line 276, because the condition on line 275 was never true)

276 raise CollectionTypeError( 

277 f"Cannot decertify from collection '{collection.name}' " 

278 f"of type {collection.type.name}; must be CALIBRATION." 

279 ) 

280 TimespanReprClass = self._db.getTimespanRepresentation() 

281 # Construct a SELECT query to find all rows that overlap our inputs. 

282 dataIdSet: Optional[DataCoordinateSet] 

283 if dataIds is not None: 

284 dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions) 

285 else: 

286 dataIdSet = None 

287 query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan) 

288 query.columns.extend(self._calibs.columns) 

289 sql = query.combine() 

290 # Set up collections to populate with the rows we'll want to modify. 

291 # The insert rows will have the same values for collection and 

292 # dataset type. 

293 protoInsertRow = { 

294 self._collections.getCollectionForeignKeyName(): collection.key, 

295 "dataset_type_id": self._dataset_type_id, 

296 } 

297 rowsToDelete = [] 

298 rowsToInsert = [] 

299 # Acquire a table lock to ensure there are no concurrent writes 

300 # between the SELECT and the DELETE and INSERT queries based on it. 

301 with self._db.transaction(lock=[self._calibs], savepoint=True): 

302 with self._db.query(sql) as sql_result: 

303 sql_rows = sql_result.mappings().fetchall() 

304 for row in sql_rows: 

305 rowsToDelete.append({"id": row["id"]}) 

306 # Construct the insert row(s) by copying the prototype row, 

307 # then adding the dimension column values, then adding what's 

308 # left of the timespan from that row after we subtract the 

309 # given timespan. 

310 newInsertRow = protoInsertRow.copy() 

311 newInsertRow["dataset_id"] = row["dataset_id"] 

312 for name in self.datasetType.dimensions.required.names: 

313 newInsertRow[name] = row[name] 

314 rowTimespan = TimespanReprClass.extract(row) 

315 assert rowTimespan is not None, "Field should have a NOT NULL constraint." 

316 for diffTimespan in rowTimespan.difference(timespan): 

317 rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy())) 

318 # Run the DELETE and INSERT queries. 

319 self._db.delete(self._calibs, ["id"], *rowsToDelete) 

320 self._db.insert(self._calibs, *rowsToInsert) 
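
The row-splitting step above relies on Timespan.difference: whatever survives of an existing validity range after removing the decertified timespan becomes zero, one, or two replacement rows. A toy version using half-open integer intervals (values invented for illustration):

    def difference(row: tuple[int, int], removed: tuple[int, int]):
        # Yield the pieces of `row` that survive after removing `removed`.
        r0, r1 = row
        q0, q1 = removed
        if q0 > r0:
            yield (r0, min(q0, r1))
        if q1 < r1:
            yield (max(q1, r0), r1)

    # Decertifying [5, 15) from a row certified for [0, 20) leaves two rows.
    print(list(difference((0, 20), (5, 15))))   # [(0, 5), (15, 20)]
    # A fully covered row yields nothing, i.e. it is simply deleted.
    print(list(difference((6, 10), (5, 15))))   # []
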

321 

322 def select( 

323 self, 

324 *collections: CollectionRecord, 

325 dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select, 

326 id: SimpleQuery.Select.Or[Optional[int]] = SimpleQuery.Select, 

327 run: SimpleQuery.Select.Or[None] = SimpleQuery.Select, 

328 timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select, 

329 ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None, 

330 ) -> sqlalchemy.sql.Selectable: 

331 # Docstring inherited from DatasetRecordStorage. 

332 collection_types = {collection.type for collection in collections} 

333 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened." 

334 # 

335 # There are two kinds of table in play here: 

336 # 

337 # - the static dataset table (with the dataset ID, dataset type ID, 

338 # run ID/name, and ingest date); 

339 # 

340 # - the dynamic tags/calibs table (with the dataset ID, dataset type 

341 # ID, collection ID/name, data ID, and possibly validity

342 # range). 

343 # 

344 # That means that we might want to return a query against either table 

345 # or a JOIN of both, depending on which quantities the caller wants. 

346 # But this method is documented/typed such that ``dataId`` is never 

347 # `None` - i.e. we always constrain or retrieve the data ID. That

348 # means we'll always include the tags/calibs table and join in the 

349 # static dataset table only if we need things from it that we can't get 

350 # from the tags/calibs table. 

351 # 

352 # Note that it's important that we include a WHERE constraint on both 

353 # tables for any column (e.g. dataset_type_id) that is in both when 

354 # it's given explicitly; not doing so can prevent the query planner from

355 # using very important indexes. At present, we don't include those 

356 # redundant columns in the JOIN ON expression, however, because the 

357 # FOREIGN KEY (and its index) are defined only on dataset_id. 

358 # 

359 # We'll start by accumulating kwargs to pass to SimpleQuery.join when 

360 # we bring in the tags/calibs table. We get the data ID or constrain 

361 # it in the tags/calibs table(s), but that's multiple columns, not one, 

362 # so we need to transform the one Select.Or argument into a dictionary 

363 # of them. 

364 kwargs: Dict[str, Any] 

365 if dataId is SimpleQuery.Select: 

366 kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required} 

367 else: 

368 kwargs = dict(dataId.byName()) 

369 # We always constrain (never retrieve) the dataset type in at least the 

370 # tags/calibs table. 

371 kwargs["dataset_type_id"] = self._dataset_type_id 

372 # Join in the tags and/or calibs tables, turning those 'kwargs' entries 

373 # into WHERE constraints or SELECT columns as appropriate. 

374 if collection_types != {CollectionType.CALIBRATION}: 

375 # We'll need a subquery for the tags table if any of the given 

376 # collections are not a CALIBRATION collection. This intentionally 

377 # also fires when the list of collections is empty as a way to 

378 # create a dummy subquery that we know will fail. 

379 tags_query = SimpleQuery() 

380 tags_query.join(self._tags, **kwargs) 

381 self._finish_single_select( 

382 tags_query, self._tags, collections, id=id, run=run, ingestDate=ingestDate 

383 ) 

384 else: 

385 tags_query = None 

386 if CollectionType.CALIBRATION in collection_types: 

387 # If at least one collection is a CALIBRATION collection, we'll 

388 # need a subquery for the calibs table, and could include the 

389 # timespan as a result or constraint. 

390 calibs_query = SimpleQuery() 

391 assert ( 

392 self._calibs is not None 

393 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection." 

394 TimespanReprClass = self._db.getTimespanRepresentation() 

395 # Add the timespan column(s) to the result columns, or constrain 

396 # the timespan via an overlap condition. 

397 if timespan is SimpleQuery.Select: 

398 kwargs.update({k: SimpleQuery.Select for k in TimespanReprClass.getFieldNames()}) 

399 elif timespan is not None: 

400 calibs_query.where.append( 

401 TimespanReprClass.fromSelectable(self._calibs).overlaps( 

402 TimespanReprClass.fromLiteral(timespan) 

403 ) 

404 ) 

405 calibs_query.join(self._calibs, **kwargs) 

406 self._finish_single_select( 

407 calibs_query, self._calibs, collections, id=id, run=run, ingestDate=ingestDate 

408 ) 

409 else: 

410 calibs_query = None 

411 if calibs_query is not None: 

412 if tags_query is not None: 

413 if timespan is not None:  (413 ↛ 414: line 413 didn't jump to line 414, because the condition on line 413 was never true)

414 raise TypeError( 

415 "Cannot query for timespan when the collections include both calibration and " 

416 "non-calibration collections." 

417 ) 

418 return tags_query.combine().union(calibs_query.combine()) 

419 else: 

420 return calibs_query.combine() 

421 else: 

422 assert tags_query is not None, "Earlier logic should guarantee at least one is not None."

423 return tags_query.combine() 
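
When both kinds of collection are searched, the method returns a UNION of two structurally identical subqueries, one over the tags table and one over the calibs table. A stripped-down sketch of that shape in plain SQLAlchemy; the tables and columns are illustrative, and the real code builds the pieces via SimpleQuery:

    import sqlalchemy

    # Toy tables; names and columns are illustrative, not the butler schema.
    metadata = sqlalchemy.MetaData()
    tags = sqlalchemy.Table(
        "tags", metadata,
        sqlalchemy.Column("dataset_id", sqlalchemy.Integer),
        sqlalchemy.Column("collection_id", sqlalchemy.Integer),
    )
    calibs = sqlalchemy.Table(
        "calibs", metadata,
        sqlalchemy.Column("dataset_id", sqlalchemy.Integer),
        sqlalchemy.Column("collection_id", sqlalchemy.Integer),
    )

    tags_query = sqlalchemy.select(tags.c.dataset_id.label("id")).where(tags.c.collection_id == 1)
    calibs_query = sqlalchemy.select(calibs.c.dataset_id.label("id")).where(calibs.c.collection_id == 2)
    # Both subqueries expose the same output columns, so they can be unioned.
    print(tags_query.union(calibs_query))
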

424 

425 def _finish_single_select( 

426 self, 

427 query: SimpleQuery, 

428 table: sqlalchemy.schema.Table, 

429 collections: Sequence[CollectionRecord], 

430 id: SimpleQuery.Select.Or[Optional[int]], 

431 run: SimpleQuery.Select.Or[None], 

432 ingestDate: SimpleQuery.Select.Or[Optional[Timespan]], 

433 ) -> None: 

434 dataset_id_col = table.columns.dataset_id 

435 collection_col = table.columns[self._collections.getCollectionForeignKeyName()] 

436 # We always constrain (never retrieve) the collection(s) in the 

437 # tags/calibs table. 

438 if len(collections) == 1: 

439 query.where.append(collection_col == collections[0].key) 

440 elif len(collections) == 0: 

441 # We support the case where there are no collections as a way to 

442 # generate a valid SQL query that can't yield results. This should 

443 # never get executed, but lots of downstream code will still try 

444 # to access the SQLAlchemy objects representing the columns in the 

445 # subquery. That's not ideal, but it'd take a lot of refactoring 

446 # to fix it (DM-31725). 

447 query.where.append(sqlalchemy.sql.literal(False)) 

448 else: 

449 query.where.append(collection_col.in_([collection.key for collection in collections])) 

450 # We can always get the dataset_id from the tags/calibs table or 

451 # constrain it there. Can't use kwargs for that because we need to 

452 # alias it to 'id'. 

453 if id is SimpleQuery.Select: 

454 query.columns.append(dataset_id_col.label("id")) 

455 elif id is not None:  (455 ↛ 456: line 455 didn't jump to line 456, because the condition on line 455 was never true)

456 query.where.append(dataset_id_col == id) 

457 # It's possible we now have everything we need, from just the 

458 # tags/calibs table. The things we might need to get from the static 

459 # dataset table are the run key and the ingest date. 

460 need_static_table = False 

461 static_kwargs: Dict[str, Any] = {} 

462 if run is not None: 

463 assert run is SimpleQuery.Select, "To constrain the run name, pass a RunRecord as a collection." 

464 if len(collections) == 1 and collections[0].type is CollectionType.RUN: 

465 # If we are searching exactly one RUN collection, we 

466 # know that if we find the dataset in that collection, 

467 # then that's the dataset's run; we don't need to

468 # query for it. 

469 query.columns.append(sqlalchemy.sql.literal(collections[0].key).label(self._runKeyColumn)) 

470 else: 

471 static_kwargs[self._runKeyColumn] = SimpleQuery.Select 

472 need_static_table = True 

473 # Ingest date can only come from the static table. 

474 if ingestDate is not None: 

475 need_static_table = True 

476 if ingestDate is SimpleQuery.Select:  (476 ↛ 479: line 476 didn't jump to line 479, because the condition on line 476 was never false)

477 static_kwargs["ingest_date"] = SimpleQuery.Select 

478 else: 

479 assert isinstance(ingestDate, Timespan) 

480 # Timespan is astropy Time (usually in TAI) and ingest_date is 

481 # TIMESTAMP, convert values to Python datetime for sqlalchemy. 

482 if ingestDate.isEmpty(): 

483 raise RuntimeError("Empty timespan constraint provided for ingest_date.") 

484 if ingestDate.begin is not None: 

485 begin = ingestDate.begin.utc.datetime # type: ignore 

486 query.where.append(self._static.dataset.columns.ingest_date >= begin) 

487 if ingestDate.end is not None: 

488 end = ingestDate.end.utc.datetime # type: ignore 

489 query.where.append(self._static.dataset.columns.ingest_date < end) 

490 # If we need the static table, join it in via dataset_id and 

491 # dataset_type_id 

492 if need_static_table: 

493 query.join( 

494 self._static.dataset, 

495 onclause=(dataset_id_col == self._static.dataset.columns.id), 

496 **static_kwargs, 

497 ) 

498 # Also constrain dataset_type_id in static table in case that helps 

499 # generate a better plan. 

500 # We could also include this in the JOIN ON clause, but my guess is 

501 # that that's a good idea IFF it's in the foreign key, and right 

502 # now it isn't. 

503 query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id) 
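
The empty-collections branch appends a literal FALSE so that downstream code still gets a syntactically valid subquery with all the expected columns; it just can never return rows. A small sketch of the trick, assuming plain SQLAlchemy on SQLite (table and column names are illustrative):

    import sqlalchemy

    # Toy table; names are illustrative, not the butler schema.
    engine = sqlalchemy.create_engine("sqlite://")
    metadata = sqlalchemy.MetaData()
    tags = sqlalchemy.Table("tags", metadata, sqlalchemy.Column("dataset_id", sqlalchemy.Integer))
    metadata.create_all(engine)

    query = sqlalchemy.select(tags.c.dataset_id.label("id")).where(sqlalchemy.literal(False))
    with engine.begin() as conn:
        conn.execute(tags.insert(), [{"dataset_id": 1}])
        print(conn.execute(query).fetchall())  # [] -- the column exists, but no row can match
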

504 

505 def getDataId(self, id: DatasetId) -> DataCoordinate: 

506 """Return DataId for a dataset. 

507 

508 Parameters 

509 ---------- 

510 id : `DatasetId` 

511 Unique dataset identifier. 

512 

513 Returns 

514 ------- 

515 dataId : `DataCoordinate` 

516 DataId for the dataset. 

517 """ 

518 # This query could return multiple rows (one for each tagged collection 

519 # the dataset is in, plus one for its run collection), and we don't 

520 # care which of those we get. 

521 sql = ( 

522 self._tags.select() 

523 .where( 

524 sqlalchemy.sql.and_( 

525 self._tags.columns.dataset_id == id, 

526 self._tags.columns.dataset_type_id == self._dataset_type_id, 

527 ) 

528 ) 

529 .limit(1) 

530 ) 

531 with self._db.query(sql) as sql_result: 

532 row = sql_result.mappings().fetchone() 

533 assert row is not None, "Should be guaranteed by caller and foreign key constraints." 

534 return DataCoordinate.standardize( 

535 {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required}, 

536 graph=self.datasetType.dimensions, 

537 ) 

538 

539 

540 class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage):

541 """Implementation of ByDimensionsDatasetRecordStorage which uses an

542 auto-incremented integer column for dataset IDs.

543 """

544 

545 def insert( 

546 self, 

547 run: RunRecord, 

548 dataIds: Iterable[DataCoordinate], 

549 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

550 ) -> Iterator[DatasetRef]: 

551 # Docstring inherited from DatasetRecordStorage. 

552 

553 # We only support UNIQUE mode for integer dataset IDs 

554 if idMode != DatasetIdGenEnum.UNIQUE:  (554 ↛ 555: line 554 didn't jump to line 555, because the condition on line 554 was never true)

555 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.") 

556 

557 # Transform a possibly-single-pass iterable into a list. 

558 dataIdList = list(dataIds) 

559 yield from self._insert(run, dataIdList) 

560 

561 def import_( 

562 self, 

563 run: RunRecord, 

564 datasets: Iterable[DatasetRef], 

565 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

566 reuseIds: bool = False, 

567 ) -> Iterator[DatasetRef]: 

568 # Docstring inherited from DatasetRecordStorage. 

569 

570 # We only support UNIQUE mode for integer dataset IDs 

571 if idGenerationMode != DatasetIdGenEnum.UNIQUE:  (571 ↛ 572: line 571 didn't jump to line 572, because the condition on line 571 was never true)

572 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.") 

573 

574 # Make a list of dataIds and optionally dataset IDs. 

575 dataIdList: List[DataCoordinate] = [] 

576 datasetIdList: List[int] = [] 

577 for dataset in datasets: 

578 dataIdList.append(dataset.dataId) 

579 

580 # We only accept integer dataset IDs, but also allow None. 

581 datasetId = dataset.id 

582 if datasetId is None:  (582 ↛ 584: line 582 didn't jump to line 584, because the condition on line 582 was never true)

583 # if reuseIds is set then all IDs must be known 

584 if reuseIds: 

585 raise TypeError("All dataset IDs must be known if `reuseIds` is set") 

586 elif isinstance(datasetId, int):  (586 ↛ 590: line 586 didn't jump to line 590, because the condition on line 586 was never false)

587 if reuseIds: 

588 datasetIdList.append(datasetId) 

589 else: 

590 raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}") 

591 

592 yield from self._insert(run, dataIdList, datasetIdList) 

593 

594 def _insert( 

595 self, run: RunRecord, dataIdList: List[DataCoordinate], datasetIdList: Optional[List[int]] = None 

596 ) -> Iterator[DatasetRef]: 

597 """Common part of implementation of `insert` and `import_` methods.""" 

598 

599 # Remember any governor dimension values we see. 

600 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

601 for dataId in dataIdList: 

602 governorValues.update_extract(dataId) 

603 

604 staticRow = { 

605 "dataset_type_id": self._dataset_type_id, 

606 self._runKeyColumn: run.key, 

607 } 

608 with self._db.transaction(): 

609 # Insert into the static dataset table, generating autoincrement 

610 # dataset_id values. 

611 if datasetIdList: 

612 # reuse existing IDs 

613 rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList] 

614 self._db.insert(self._static.dataset, *rows) 

615 else: 

616 # use auto-incremented IDs 

617 datasetIdList = self._db.insert( 

618 self._static.dataset, *([staticRow] * len(dataIdList)), returnIds=True 

619 ) 

620 assert datasetIdList is not None 

621 # Update the summary tables for this collection in case this is the 

622 # first time this dataset type or these governor values will be 

623 # inserted there. 

624 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues) 

625 # Combine the generated dataset_id values and data ID fields to 

626 # form rows to be inserted into the tags table. 

627 protoTagsRow = { 

628 "dataset_type_id": self._dataset_type_id, 

629 self._collections.getCollectionForeignKeyName(): run.key, 

630 } 

631 tagsRows = [ 

632 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

633 for dataId, dataset_id in zip(dataIdList, datasetIdList) 

634 ] 

635 # Insert those rows into the tags table. This is where we'll 

636 # get any unique constraint violations. 

637 self._db.insert(self._tags, *tagsRows) 

638 

639 for dataId, datasetId in zip(dataIdList, datasetIdList): 

640 yield DatasetRef( 

641 datasetType=self.datasetType, 

642 dataId=dataId, 

643 id=datasetId, 

644 run=run.name, 

645 ) 
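
The two branches of _insert above either reuse caller-supplied integer IDs or let the database generate them (via returnIds=True in the butler Database API). A toy sketch of both paths, assuming plain SQLAlchemy on SQLite rather than that API (table and column names are illustrative):

    import sqlalchemy

    # Toy table; names are illustrative, not the butler schema.
    engine = sqlalchemy.create_engine("sqlite://")
    metadata = sqlalchemy.MetaData()
    dataset = sqlalchemy.Table(
        "dataset", metadata,
        sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True, autoincrement=True),
        sqlalchemy.Column("run", sqlalchemy.String),
    )
    metadata.create_all(engine)

    with engine.begin() as conn:
        # Path 1: caller supplies the IDs (the reuseIds branch).
        conn.execute(dataset.insert(), [{"id": 10, "run": "r"}, {"id": 11, "run": "r"}])
        # Path 2: let the database autoincrement and collect each generated ID.
        generated = [
            conn.execute(dataset.insert().values(run="r")).inserted_primary_key[0]
            for _ in range(2)
        ]
        print(generated)  # e.g. [12, 13]
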

646 

647 

648 class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):

649 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for 

650 dataset IDs. 

651 """ 

652 

653 NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f") 

654 """Namespace UUID used for UUID5 generation. Do not change. This was 

655 produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`. 

656 """ 

657 

658 def insert( 

659 self, 

660 run: RunRecord, 

661 dataIds: Iterable[DataCoordinate], 

662 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

663 ) -> Iterator[DatasetRef]: 

664 # Docstring inherited from DatasetRecordStorage. 

665 

666 # Remember any governor dimension values we see. 

667 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

668 

669 # Iterate over data IDs, transforming a possibly-single-pass iterable 

670 # into a list. 

671 dataIdList = [] 

672 rows = [] 

673 for dataId in dataIds: 

674 dataIdList.append(dataId) 

675 rows.append( 

676 { 

677 "id": self._makeDatasetId(run, dataId, idMode), 

678 "dataset_type_id": self._dataset_type_id, 

679 self._runKeyColumn: run.key, 

680 } 

681 ) 

682 governorValues.update_extract(dataId) 

683 

684 with self._db.transaction(): 

685 # Insert into the static dataset table. 

686 self._db.insert(self._static.dataset, *rows) 

687 # Update the summary tables for this collection in case this is the 

688 # first time this dataset type or these governor values will be 

689 # inserted there. 

690 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues) 

691 # Combine the generated dataset_id values and data ID fields to 

692 # form rows to be inserted into the tags table. 

693 protoTagsRow = { 

694 "dataset_type_id": self._dataset_type_id, 

695 self._collections.getCollectionForeignKeyName(): run.key, 

696 } 

697 tagsRows = [ 

698 dict(protoTagsRow, dataset_id=row["id"], **dataId.byName()) 

699 for dataId, row in zip(dataIdList, rows) 

700 ] 

701 # Insert those rows into the tags table. 

702 self._db.insert(self._tags, *tagsRows) 

703 

704 for dataId, row in zip(dataIdList, rows): 

705 yield DatasetRef( 

706 datasetType=self.datasetType, 

707 dataId=dataId, 

708 id=row["id"], 

709 run=run.name, 

710 ) 

711 

712 def import_( 

713 self, 

714 run: RunRecord, 

715 datasets: Iterable[DatasetRef], 

716 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

717 reuseIds: bool = False, 

718 ) -> Iterator[DatasetRef]: 

719 # Docstring inherited from DatasetRecordStorage. 

720 

721 # Remember any governor dimension values we see. 

722 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

723 

724 # Iterate over data IDs, transforming a possibly-single-pass iterable 

725 # into a list. 

726 dataIds = {} 

727 for dataset in datasets: 

728 # Ignore unknown ID types; normally all IDs have the same type, but

729 # this code supports mixed types or missing IDs. 

730 datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None 

731 if datasetId is None: 

732 datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode) 

733 dataIds[datasetId] = dataset.dataId 

734 governorValues.update_extract(dataset.dataId) 

735 

736 # We'll insert all new rows into a temporary table 

737 tableSpec = makeTagTableSpec(self.datasetType, type(self._collections), ddl.GUID, constraints=False) 

738 collFkName = self._collections.getCollectionForeignKeyName() 

739 protoTagsRow = { 

740 "dataset_type_id": self._dataset_type_id, 

741 collFkName: run.key, 

742 } 

743 tmpRows = [ 

744 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

745 for dataset_id, dataId in dataIds.items() 

746 ] 

747 with self._db.transaction(for_temp_tables=True): 

748 with self._db.temporary_table(tableSpec) as tmp_tags: 

749 # store all incoming data in a temporary table 

750 self._db.insert(tmp_tags, *tmpRows) 

751 

752 # There are some checks that we want to make for consistency 

753 # of the new datasets with existing ones. 

754 self._validateImport(tmp_tags, run) 

755 

756 # Before we merge temporary table into dataset/tags we need to 

757 # drop datasets which are already there (and do not conflict). 

758 self._db.deleteWhere( 

759 tmp_tags, 

760 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)), 

761 ) 

762 

763 # Copy it into dataset table, need to re-label some columns. 

764 self._db.insert( 

765 self._static.dataset, 

766 select=sqlalchemy.sql.select( 

767 tmp_tags.columns.dataset_id.label("id"), 

768 tmp_tags.columns.dataset_type_id, 

769 tmp_tags.columns[collFkName].label(self._runKeyColumn), 

770 ), 

771 ) 

772 

773 # Update the summary tables for this collection in case this 

774 # is the first time this dataset type or these governor values 

775 # will be inserted there. 

776 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues) 

777 

778 # Copy it into tags table. 

779 self._db.insert(self._tags, select=tmp_tags.select()) 

780 

781 # Return refs in the same order as in the input list. 

782 for dataset_id, dataId in dataIds.items(): 

783 yield DatasetRef( 

784 datasetType=self.datasetType, 

785 id=dataset_id, 

786 dataId=dataId, 

787 run=run.name, 

788 ) 
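
The import path stages everything in a temporary table, validates it, deletes the staged rows whose IDs already exist, and then copies the remainder into the real tables with INSERT ... SELECT. A compressed sketch of that merge, assuming plain SQLAlchemy on SQLite with toy one-column tables (names illustrative, not the butler schema):

    import sqlalchemy

    # Toy tables; names and columns are illustrative, not the butler schema.
    engine = sqlalchemy.create_engine("sqlite://")
    metadata = sqlalchemy.MetaData()
    dataset = sqlalchemy.Table("dataset", metadata, sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True))
    tmp_tags = sqlalchemy.Table("tmp_tags", metadata, sqlalchemy.Column("dataset_id", sqlalchemy.Integer))
    metadata.create_all(engine)

    with engine.begin() as conn:
        conn.execute(dataset.insert(), [{"id": 1}])                              # already registered
        conn.execute(tmp_tags.insert(), [{"dataset_id": 1}, {"dataset_id": 2}])  # staged import
        # Drop staged rows whose IDs are already present in the main table.
        conn.execute(
            tmp_tags.delete().where(tmp_tags.c.dataset_id.in_(sqlalchemy.select(dataset.c.id)))
        )
        # Copy what is left, relabelling the column to match the target table.
        conn.execute(
            dataset.insert().from_select(["id"], sqlalchemy.select(tmp_tags.c.dataset_id.label("id")))
        )
        print(conn.execute(sqlalchemy.select(dataset.c.id).order_by(dataset.c.id)).fetchall())  # [(1,), (2,)]
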

789 

790 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None: 

791 """Validate imported refs against existing datasets. 

792 

793 Parameters 

794 ---------- 

795 tmp_tags : `sqlalchemy.schema.Table` 

796 Temporary table with new datasets and the same schema as tags 

797 table. 

798 run : `RunRecord` 

799 The record object describing the `~CollectionType.RUN` collection. 

800 

801 Raises 

802 ------ 

803 ConflictingDefinitionError 

804 Raised if new datasets conflict with existing ones.

805 """ 

806 dataset = self._static.dataset 

807 tags = self._tags 

808 collFkName = self._collections.getCollectionForeignKeyName() 

809 

810 # Check that existing datasets have the same dataset type and 

811 # run. 

812 query = ( 

813 sqlalchemy.sql.select( 

814 dataset.columns.id.label("dataset_id"), 

815 dataset.columns.dataset_type_id.label("dataset_type_id"), 

816 tmp_tags.columns.dataset_type_id.label("new dataset_type_id"), 

817 dataset.columns[self._runKeyColumn].label("run"), 

818 tmp_tags.columns[collFkName].label("new run"), 

819 ) 

820 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id)) 

821 .where( 

822 sqlalchemy.sql.or_( 

823 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

824 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName], 

825 ) 

826 ) 

827 .limit(1) 

828 ) 

829 with self._db.query(query) as result: 

830 if (row := result.first()) is not None: 

831 # Only include the first one in the exception message 

832 raise ConflictingDefinitionError( 

833 f"Existing dataset type or run do not match new dataset: {row._asdict()}" 

834 ) 

835 

836 # Check that matching dataset in tags table has the same DataId. 

837 query = ( 

838 sqlalchemy.sql.select( 

839 tags.columns.dataset_id, 

840 tags.columns.dataset_type_id.label("type_id"), 

841 tmp_tags.columns.dataset_type_id.label("new type_id"), 

842 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

843 *[ 

844 tmp_tags.columns[dim].label(f"new {dim}") 

845 for dim in self.datasetType.dimensions.required.names 

846 ], 

847 ) 

848 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id)) 

849 .where( 

850 sqlalchemy.sql.or_( 

851 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

852 *[ 

853 tags.columns[dim] != tmp_tags.columns[dim] 

854 for dim in self.datasetType.dimensions.required.names 

855 ], 

856 ) 

857 ) 

858 .limit(1) 

859 ) 

860 

861 with self._db.query(query) as result: 

862 if (row := result.first()) is not None: 

863 # Only include the first one in the exception message 

864 raise ConflictingDefinitionError( 

865 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}" 

866 ) 

867 

868 # Check that matching run+dataId have the same dataset ID. 

869 query = ( 

870 sqlalchemy.sql.select( 

871 tags.columns.dataset_type_id.label("dataset_type_id"), 

872 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

873 tags.columns.dataset_id, 

874 tmp_tags.columns.dataset_id.label("new dataset_id"), 

875 tags.columns[collFkName], 

876 tmp_tags.columns[collFkName].label(f"new {collFkName}"), 

877 ) 

878 .select_from( 

879 tags.join( 

880 tmp_tags, 

881 sqlalchemy.sql.and_( 

882 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id, 

883 tags.columns[collFkName] == tmp_tags.columns[collFkName], 

884 *[ 

885 tags.columns[dim] == tmp_tags.columns[dim] 

886 for dim in self.datasetType.dimensions.required.names 

887 ], 

888 ), 

889 ) 

890 ) 

891 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id) 

892 .limit(1) 

893 ) 

894 with self._db.query(query) as result: 

895 if (row := result.first()) is not None: 

896 # only include the first one in the exception message 

897 raise ConflictingDefinitionError( 

898 f"Existing dataset type and dataId does not match new dataset: {row._asdict()}" 

899 ) 
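
Each of the three checks above follows the same pattern: join the staged table to an existing table, keep only rows where some column disagrees, and LIMIT 1 so a single offending row can be reported. A cut-down sketch of the first check (existing dataset with a different run), assuming plain SQLAlchemy on SQLite with simplified tables (names illustrative):

    import sqlalchemy

    # Toy tables; names and columns are illustrative, not the butler schema.
    engine = sqlalchemy.create_engine("sqlite://")
    metadata = sqlalchemy.MetaData()
    dataset = sqlalchemy.Table(
        "dataset", metadata,
        sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True),
        sqlalchemy.Column("run", sqlalchemy.String),
    )
    tmp_tags = sqlalchemy.Table(
        "tmp_tags", metadata,
        sqlalchemy.Column("dataset_id", sqlalchemy.Integer),
        sqlalchemy.Column("run", sqlalchemy.String),
    )
    metadata.create_all(engine)

    with engine.begin() as conn:
        conn.execute(dataset.insert(), [{"id": 1, "run": "a"}])
        conn.execute(tmp_tags.insert(), [{"dataset_id": 1, "run": "b"}])  # same ID, different run
        query = (
            sqlalchemy.select(dataset.c.id, dataset.c.run, tmp_tags.c.run.label("new run"))
            .select_from(dataset.join(tmp_tags, dataset.c.id == tmp_tags.c.dataset_id))
            .where(dataset.c.run != tmp_tags.c.run)
            .limit(1)
        )
        if (row := conn.execute(query).first()) is not None:
            print(f"conflict: {row._asdict()}")  # conflict: {'id': 1, 'run': 'a', 'new run': 'b'}
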

900 

901 def _makeDatasetId( 

902 self, run: RunRecord, dataId: DataCoordinate, idGenerationMode: DatasetIdGenEnum 

903 ) -> uuid.UUID: 

904 """Generate dataset ID for a dataset. 

905 

906 Parameters 

907 ---------- 

908 run : `RunRecord` 

909 The record object describing the RUN collection for the dataset. 

910 dataId : `DataCoordinate` 

911 Expanded data ID for the dataset. 

912 idGenerationMode : `DatasetIdGenEnum` 

913 ID generation option. `~DatasetIdGenEnum.UNIQUE` make a random 

914 UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a 

915 deterministic UUID5-type ID based on a dataset type name and 

916 ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a 

917 deterministic UUID5-type ID based on a dataset type name, run 

918 collection name, and ``dataId``. 

919 

920 Returns 

921 ------- 

922 datasetId : `uuid.UUID` 

923 Dataset identifier. 

924 """ 

925 if idGenerationMode is DatasetIdGenEnum.UNIQUE: 

926 return uuid.uuid4() 

927 else: 

928 # WARNING: If you modify this code make sure that the order of 

929 # items in the `items` list below never changes. 

930 items: List[Tuple[str, str]] = [] 

931 if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE: 

932 items = [ 

933 ("dataset_type", self.datasetType.name), 

934 ] 

935 elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:  (935 ↛ 941: line 935 didn't jump to line 941, because the condition on line 935 was never false)

936 items = [ 

937 ("dataset_type", self.datasetType.name), 

938 ("run", run.name), 

939 ] 

940 else: 

941 raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}") 

942 

943 for name, value in sorted(dataId.byName().items()): 

944 items.append((name, str(value))) 

945 data = ",".join(f"{key}={value}" for key, value in items) 

946 return uuid.uuid5(self.NS_UUID, data)
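
A small sketch of the deterministic DATAID_TYPE_RUN path: identical dataset type, run, and data ID always hash to the same UUID5. The namespace and the sorted "key=value" layout mirror the code above; the concrete dataset type, run, and data ID values are invented for illustration:

    import uuid

    NS_UUID = uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")

    # Invented inputs standing in for a real dataset type, run, and data ID.
    items = [("dataset_type", "calexp"), ("run", "demo/run")]
    data_id = {"instrument": "DemoCam", "visit": 42, "detector": 3}
    for name, value in sorted(data_id.items()):
        items.append((name, str(value)))
    data = ",".join(f"{key}={value}" for key, value in items)

    # Deterministic: re-running with identical inputs reproduces the same ID.
    assert uuid.uuid5(NS_UUID, data) == uuid.uuid5(NS_UUID, data)
    print(uuid.uuid5(NS_UUID, data))
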