Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 88%


304 statements  

1from __future__ import annotations 

2 

3__all__ = ("ByDimensionsDatasetRecordStorage",) 

4 

5import uuid 

6from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple 

7 

8import sqlalchemy 

9from lsst.daf.butler import ( 

10 CollectionType, 

11 DataCoordinate, 

12 DataCoordinateSet, 

13 DatasetId, 

14 DatasetRef, 

15 DatasetType, 

16 SimpleQuery, 

17 Timespan, 

18 ddl, 

19) 

20from lsst.daf.butler.registry import ( 

21 CollectionTypeError, 

22 ConflictingDefinitionError, 

23 UnsupportedIdGeneratorError, 

24) 

25from lsst.daf.butler.registry.interfaces import DatasetIdGenEnum, DatasetRecordStorage 

26 

27from ...summaries import GovernorDimensionRestriction 

28from .tables import makeTagTableSpec 

29 

30if TYPE_CHECKING: 30 ↛ 31 (line 30 didn't jump to line 31, because the condition on line 30 was never true)

31 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord 

32 from .summaries import CollectionSummaryManager 

33 from .tables import StaticDatasetTablesTuple 

34 

35 

36class ByDimensionsDatasetRecordStorage(DatasetRecordStorage): 

37 """Dataset record storage implementation paired with 

38 `ByDimensionsDatasetRecordStorageManager`; see that class for more 

39 information. 

40 

41 Instances of this class should never be constructed directly; use 

42 `DatasetRecordStorageManager.register` instead. 

43 """ 

44 

45 def __init__( 

46 self, 

47 *, 

48 datasetType: DatasetType, 

49 db: Database, 

50 dataset_type_id: int, 

51 collections: CollectionManager, 

52 static: StaticDatasetTablesTuple, 

53 summaries: CollectionSummaryManager, 

54 tags: sqlalchemy.schema.Table, 

55 calibs: Optional[sqlalchemy.schema.Table], 

56 ): 

57 super().__init__(datasetType=datasetType) 

58 self._dataset_type_id = dataset_type_id 

59 self._db = db 

60 self._collections = collections 

61 self._static = static 

62 self._summaries = summaries 

63 self._tags = tags 

64 self._calibs = calibs 

65 self._runKeyColumn = collections.getRunForeignKeyName() 

66 

67 def find( 

68 self, collection: CollectionRecord, dataId: DataCoordinate, timespan: Optional[Timespan] = None 

69 ) -> Optional[DatasetRef]: 

70 # Docstring inherited from DatasetRecordStorage. 

71 assert dataId.graph == self.datasetType.dimensions 

72 if collection.type is CollectionType.CALIBRATION and timespan is None: 72 ↛ 73 (line 72 didn't jump to line 73, because the condition on line 72 was never true)

73 raise TypeError( 

74 f"Cannot search for dataset in CALIBRATION collection {collection.name} " 

75 f"without an input timespan." 

76 ) 

77 sql = self.select( 

78 collection, dataId=dataId, id=SimpleQuery.Select, run=SimpleQuery.Select, timespan=timespan 

79 ) 

80 sql = sql.combine() 

81 results = self._db.query(sql) 

82 row = results.fetchone() 

83 if row is None: 

84 return None 

85 if collection.type is CollectionType.CALIBRATION: 

86 # For temporal calibration lookups (only!) our invariants do not 

87 # guarantee that the number of result rows is <= 1. 

88 # They would if `select` constrained the given timespan to be 

89 # _contained_ by the validity range in the self._calibs table, 

90 # instead of simply _overlapping_ it, because we do guarantee that 

91 # the validity ranges are disjoint for a particular dataset type, 

92 # collection, and data ID. But using an overlap test and a check 

93 # for multiple result rows here allows us to provide a more useful 

94 # diagnostic, as well as allowing `select` to support more general 

95 # queries where multiple results are not an error. 

96 if results.fetchone() is not None: 

97 raise RuntimeError( 

98 f"Multiple matches found for calibration lookup in {collection.name} for " 

99 f"{self.datasetType.name} with {dataId} overlapping {timespan}. " 

100 ) 

101 return DatasetRef( 

102 datasetType=self.datasetType, 

103 dataId=dataId, 

104 id=row.id, 

105 run=self._collections[row._mapping[self._runKeyColumn]].name, 

106 ) 

107 
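# A minimal illustrative call pattern for `find`, grounded in the checks
# above: CALIBRATION collections require a timespan, other collection types
# do not. `storage`, `tagged_record`, `calib_record`, `data_id`, and `span`
# are placeholder names, not objects defined in this module.
#
#     ref = storage.find(tagged_record, data_id)                 # RUN/TAGGED: OK
#     ref = storage.find(calib_record, data_id)                  # raises TypeError
#     ref = storage.find(calib_record, data_id, timespan=span)   # OK, at most one match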

108 def delete(self, datasets: Iterable[DatasetRef]) -> None: 

109 # Docstring inherited from DatasetRecordStorage. 

110 # Only delete from common dataset table; ON DELETE foreign key clauses 

111 # will handle the rest. 

112 self._db.delete( 

113 self._static.dataset, 

114 ["id"], 

115 *[{"id": dataset.getCheckedId()} for dataset in datasets], 

116 ) 

117 

118 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

119 # Docstring inherited from DatasetRecordStorage. 

120 if collection.type is not CollectionType.TAGGED: 120 ↛ 121 (line 120 didn't jump to line 121, because the condition on line 120 was never true)

121 raise TypeError( 

122 f"Cannot associate into collection '{collection.name}' " 

123 f"of type {collection.type.name}; must be TAGGED." 

124 ) 

125 protoRow = { 

126 self._collections.getCollectionForeignKeyName(): collection.key, 

127 "dataset_type_id": self._dataset_type_id, 

128 } 

129 rows = [] 

130 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

131 for dataset in datasets: 

132 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

133 for dimension, value in dataset.dataId.items(): 

134 row[dimension.name] = value 

135 governorValues.update_extract(dataset.dataId) 

136 rows.append(row) 

137 # Update the summary tables for this collection in case this is the 

138 # first time this dataset type or these governor values will be 

139 # inserted there. 

140 self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues) 

141 # Update the tag table itself. 

142 self._db.replace(self._tags, *rows) 

143 

144 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

145 # Docstring inherited from DatasetRecordStorage. 

146 if collection.type is not CollectionType.TAGGED: 146 ↛ 147 (line 146 didn't jump to line 147, because the condition on line 146 was never true)

147 raise TypeError( 

148 f"Cannot disassociate from collection '{collection.name}' " 

149 f"of type {collection.type.name}; must be TAGGED." 

150 ) 

151 rows = [ 

152 { 

153 "dataset_id": dataset.getCheckedId(), 

154 self._collections.getCollectionForeignKeyName(): collection.key, 

155 } 

156 for dataset in datasets 

157 ] 

158 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows) 

159 

160 def _buildCalibOverlapQuery( 

161 self, collection: CollectionRecord, dataIds: Optional[DataCoordinateSet], timespan: Timespan 

162 ) -> SimpleQuery: 

163 assert self._calibs is not None 

164 # Start by building a SELECT query for any rows that would overlap 

165 # this one. 

166 query = SimpleQuery() 

167 query.join(self._calibs) 

168 # Add a WHERE clause matching the dataset type and collection. 

169 query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id) 

170 query.where.append( 

171 self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key 

172 ) 

173 # Add a WHERE clause matching any of the given data IDs. 

174 if dataIds is not None: 

175 dataIds.constrain( 

176 query, 

177 lambda name: self._calibs.columns[name], # type: ignore 

178 ) 

179 # Add WHERE clause for timespan overlaps. 

180 TimespanReprClass = self._db.getTimespanRepresentation() 

181 query.where.append( 

182 TimespanReprClass.fromSelectable(self._calibs).overlaps(TimespanReprClass.fromLiteral(timespan)) 

183 ) 

184 return query 

185 

186 def certify( 

187 self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan 

188 ) -> None: 

189 # Docstring inherited from DatasetRecordStorage. 

190 if self._calibs is None: 190 ↛ 191 (line 190 didn't jump to line 191, because the condition on line 190 was never true)

191 raise CollectionTypeError( 

192 f"Cannot certify datasets of type {self.datasetType.name}, for which " 

193 f"DatasetType.isCalibration() is False." 

194 ) 

195 if collection.type is not CollectionType.CALIBRATION: 195 ↛ 196 (line 195 didn't jump to line 196, because the condition on line 195 was never true)

196 raise CollectionTypeError( 

197 f"Cannot certify into collection '{collection.name}' " 

198 f"of type {collection.type.name}; must be CALIBRATION." 

199 ) 

200 TimespanReprClass = self._db.getTimespanRepresentation() 

201 protoRow = { 

202 self._collections.getCollectionForeignKeyName(): collection.key, 

203 "dataset_type_id": self._dataset_type_id, 

204 } 

205 rows = [] 

206 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

207 dataIds: Optional[Set[DataCoordinate]] = ( 

208 set() if not TimespanReprClass.hasExclusionConstraint() else None 

209 ) 

210 for dataset in datasets: 

211 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

212 for dimension, value in dataset.dataId.items(): 

213 row[dimension.name] = value 

214 TimespanReprClass.update(timespan, result=row) 

215 governorValues.update_extract(dataset.dataId) 

216 rows.append(row) 

217 if dataIds is not None: 217 ↛ 210 (line 217 didn't jump to line 210, because the condition on line 217 was never false)

218 dataIds.add(dataset.dataId) 

219 # Update the summary tables for this collection in case this is the 

220 # first time this dataset type or these governor values will be 

221 # inserted there. 

222 self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues) 

223 # Update the association table itself. 

224 if TimespanReprClass.hasExclusionConstraint(): 224 ↛ 227 (line 224 didn't jump to line 227, because the condition on line 224 was never true)

225 # Rely on database constraint to enforce invariants; we just 

226 # reraise the exception for consistency across DB engines. 

227 try: 

228 self._db.insert(self._calibs, *rows) 

229 except sqlalchemy.exc.IntegrityError as err: 

230 raise ConflictingDefinitionError( 

231 f"Validity range conflict certifying datasets of type {self.datasetType.name} " 

232 f"into {collection.name} for range [{timespan.begin}, {timespan.end})." 

233 ) from err 

234 else: 

235 # Have to implement exclusion constraint ourselves. 

236 # Start by building a SELECT query for any rows that would overlap 

237 # this one. 

238 query = self._buildCalibOverlapQuery( 

239 collection, 

240 DataCoordinateSet(dataIds, graph=self.datasetType.dimensions), # type: ignore 

241 timespan, 

242 ) 

243 query.columns.append(sqlalchemy.sql.func.count()) 

244 sql = query.combine() 

245 # Acquire a table lock to ensure there are no concurrent writes 

246 # that could invalidate our checking before we finish the inserts. We 

247 # use a SAVEPOINT in case there is an outer transaction that a 

248 # failure here should not roll back. 

249 with self._db.transaction(lock=[self._calibs], savepoint=True): 

250 # Run the check SELECT query. 

251 conflicting = self._db.query(sql).scalar() 

252 if conflicting > 0: 

253 raise ConflictingDefinitionError( 

254 f"{conflicting} validity range conflicts certifying datasets of type " 

255 f"{self.datasetType.name} into {collection.name} for range " 

256 f"[{timespan.begin}, {timespan.end})." 

257 ) 

258 # Proceed with the insert. 

259 self._db.insert(self._calibs, *rows) 

260 
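# A behavioural sketch of `certify` above (placeholder names). Certifying the
# same data ID into the same CALIBRATION collection with overlapping validity
# ranges raises ConflictingDefinitionError, whether the overlap is caught by a
# database exclusion constraint or by the locked SELECT-count check.
#
#     storage.certify(calib_record, [ref], span_2020)   # OK
#     storage.certify(calib_record, [ref], span_2020)   # ConflictingDefinitionError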

261 def decertify( 

262 self, 

263 collection: CollectionRecord, 

264 timespan: Timespan, 

265 *, 

266 dataIds: Optional[Iterable[DataCoordinate]] = None, 

267 ) -> None: 

268 # Docstring inherited from DatasetRecordStorage. 

269 if self._calibs is None: 269 ↛ 270 (line 269 didn't jump to line 270, because the condition on line 269 was never true)

270 raise CollectionTypeError( 

271 f"Cannot decertify datasets of type {self.datasetType.name}, for which " 

272 f"DatasetType.isCalibration() is False." 

273 ) 

274 if collection.type is not CollectionType.CALIBRATION: 274 ↛ 275 (line 274 didn't jump to line 275, because the condition on line 274 was never true)

275 raise CollectionTypeError( 

276 f"Cannot decertify from collection '{collection.name}' " 

277 f"of type {collection.type.name}; must be CALIBRATION." 

278 ) 

279 TimespanReprClass = self._db.getTimespanRepresentation() 

280 # Construct a SELECT query to find all rows that overlap our inputs. 

281 dataIdSet: Optional[DataCoordinateSet] 

282 if dataIds is not None: 

283 dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions) 

284 else: 

285 dataIdSet = None 

286 query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan) 

287 query.columns.extend(self._calibs.columns) 

288 sql = query.combine() 

289 # Set up collections to populate with the rows we'll want to modify. 

290 # The insert rows will have the same values for collection and 

291 # dataset type. 

292 protoInsertRow = { 

293 self._collections.getCollectionForeignKeyName(): collection.key, 

294 "dataset_type_id": self._dataset_type_id, 

295 } 

296 rowsToDelete = [] 

297 rowsToInsert = [] 

298 # Acquire a table lock to ensure there are no concurrent writes 

299 # between the SELECT and the DELETE and INSERT queries based on it. 

300 with self._db.transaction(lock=[self._calibs], savepoint=True): 

301 for row in self._db.query(sql).mappings(): 

302 rowsToDelete.append({"id": row["id"]}) 

303 # Construct the insert row(s) by copying the prototype row, 

304 # then adding the dimension column values, then adding what's 

305 # left of the timespan from that row after we subtract the 

306 # given timespan. 

307 newInsertRow = protoInsertRow.copy() 

308 newInsertRow["dataset_id"] = row["dataset_id"] 

309 for name in self.datasetType.dimensions.required.names: 

310 newInsertRow[name] = row[name] 

311 rowTimespan = TimespanReprClass.extract(row) 

312 assert rowTimespan is not None, "Field should have a NOT NULL constraint." 

313 for diffTimespan in rowTimespan.difference(timespan): 

314 rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy())) 

315 # Run the DELETE and INSERT queries. 

316 self._db.delete(self._calibs, ["id"], *rowsToDelete) 

317 self._db.insert(self._calibs, *rowsToInsert) 

318 

319 def select( 

320 self, 

321 *collections: CollectionRecord, 

322 dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select, 

323 id: SimpleQuery.Select.Or[Optional[int]] = SimpleQuery.Select, 

324 run: SimpleQuery.Select.Or[None] = SimpleQuery.Select, 

325 timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select, 

326 ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None, 

327 ) -> SimpleQuery: 

328 # Docstring inherited from DatasetRecordStorage. 

329 collection_types = {collection.type for collection in collections} 

330 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened." 

331 # 

332 # There are two tables in play here: 

333 # 

334 # - the static dataset table (with the dataset ID, dataset type ID, 

335 # run ID/name, and ingest date); 

336 # 

337 # - the dynamic tags/calibs table (with the dataset ID, dataset type 

338 # type ID, collection ID/name, data ID, and possibly validity 

339 # range). 

340 # 

341 # That means that we might want to return a query against either table 

342 # or a JOIN of both, depending on which quantities the caller wants. 

343 # But this method is documented/typed such that ``dataId`` is never 

344 # `None` - i.e. we always constrain or retrieve the data ID. That 

345 # means we'll always include the tags/calibs table and join in the 

346 # static dataset table only if we need things from it that we can't get 

347 # from the tags/calibs table. 

348 # 

349 # Note that it's important that we include a WHERE constraint on both 

350 # tables for any column (e.g. dataset_type_id) that is in both when 

351 # it's given explicitly; not doing so can prevent the query planner from 

352 # using very important indexes. At present, we don't include those 

353 # redundant columns in the JOIN ON expression, however, because the 

354 # FOREIGN KEY (and its index) are defined only on dataset_id. 

355 # 

356 # We'll start with an empty SimpleQuery, and accumulate kwargs to pass 

357 # to its `join` method when we bring in the tags/calibs table. 
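# A minimal illustrative call pattern (placeholder names), mirroring how
# `find` above drives this method: request the id and run, constrain the data
# ID and collection, then combine and execute.
#
#     simple_query = storage.select(
#         tagged_record, dataId=data_id, id=SimpleQuery.Select, run=SimpleQuery.Select
#     )
#     rows = db.query(simple_query.combine()).fetchall()
#
# For a single TAGGED collection this produces a SELECT over the tags table,
# joined to the static dataset table on dataset_id only when the run or ingest
# date must come from it, with dataset_type_id constrained in both tables.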

358 query = SimpleQuery() 

359 # We get the data ID or constrain it in the tags/calibs table, but 

360 # that's multiple columns, not one, so we need to transform the one 

361 # Select.Or argument into a dictionary of them. 

362 kwargs: Dict[str, Any] 

363 if dataId is SimpleQuery.Select: 

364 kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required} 

365 else: 

366 kwargs = dict(dataId.byName()) 

367 # We always constrain (never retrieve) the dataset type in at least the 

368 # tags/calibs table. 

369 kwargs["dataset_type_id"] = self._dataset_type_id 

370 # Join in the tags or calibs table, turning those 'kwargs' entries into 

371 # WHERE constraints or SELECT columns as appropriate. 

372 if collection_types == {CollectionType.CALIBRATION}: 

373 assert ( 

374 self._calibs is not None 

375 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection." 

376 TimespanReprClass = self._db.getTimespanRepresentation() 

377 # Add the timespan column(s) to the result columns, or constrain 

378 # the timespan via an overlap condition. 

379 if timespan is SimpleQuery.Select: 

380 kwargs.update({k: SimpleQuery.Select for k in TimespanReprClass.getFieldNames()}) 

381 elif timespan is not None: 381 ↛ 387 (line 381 didn't jump to line 387, because the condition on line 381 was never false)

382 query.where.append( 

383 TimespanReprClass.fromSelectable(self._calibs).overlaps( 

384 TimespanReprClass.fromLiteral(timespan) 

385 ) 

386 ) 

387 query.join(self._calibs, **kwargs) 

388 dataset_id_col = self._calibs.columns.dataset_id 

389 collection_col = self._calibs.columns[self._collections.getCollectionForeignKeyName()] 

390 elif CollectionType.CALIBRATION not in collection_types: 390 ↛ 395 (line 390 didn't jump to line 395, because the condition on line 390 was never false)

391 query.join(self._tags, **kwargs) 

392 dataset_id_col = self._tags.columns.dataset_id 

393 collection_col = self._tags.columns[self._collections.getCollectionForeignKeyName()] 

394 else: 

395 raise TypeError( 

396 "Cannot query for CALIBRATION collections in the same " 

397 "subquery as other kinds of collections." 

398 ) 

399 # We always constrain (never retrieve) the collection(s) in the 

400 # tags/calibs table. 

401 if len(collections) == 1: 

402 query.where.append(collection_col == collections[0].key) 

403 elif len(collections) == 0: 

404 # We support the case where there are no collections as a way to 

405 # generate a valid SQL query that can't yield results. This should 

406 # never get executed, but lots of downstream code will still try 

407 # to access the SQLAlchemy objects representing the columns in the 

408 # subquery. That's not ideal, but it'd take a lot of refactoring to 

409 # fix it. 

410 query.where.append(sqlalchemy.sql.literal(False)) 

411 else: 

412 query.where.append(collection_col.in_([collection.key for collection in collections])) 

413 # We can always get the dataset_id from the tags/calibs table or 

414 # constrain it there. Can't use kwargs for that because we need to 

415 # alias it to 'id'. 

416 if id is SimpleQuery.Select: 

417 query.columns.append(dataset_id_col.label("id")) 

418 elif id is not None: 418 ↛ 419 (line 418 didn't jump to line 419, because the condition on line 418 was never true)

419 query.where.append(dataset_id_col == id) 

420 # It's possible we now have everything we need, from just the 

421 # tags/calibs table. The things we might need to get from the static 

422 # dataset table are the run key and the ingest date. 

423 need_static_table = False 

424 static_kwargs: Dict[str, Any] = {} 

425 if run is not None: 

426 assert run is SimpleQuery.Select, "To constrain the run name, pass a RunRecord as a collection." 

427 if len(collections) == 1 and collections[0].type is CollectionType.RUN: 

428 # If we are searching exactly one RUN collection, we 

429 # know that if we find the dataset in that collection, 

430 # then that's the dataset's run; we don't need to 

431 # query for it. 

432 query.columns.append(sqlalchemy.sql.literal(collections[0].key).label(self._runKeyColumn)) 

433 else: 

434 static_kwargs[self._runKeyColumn] = SimpleQuery.Select 

435 need_static_table = True 

436 # Ingest date can only come from the static table. 

437 if ingestDate is not None: 

438 need_static_table = True 

439 if ingestDate is SimpleQuery.Select: 439 ↛ 442 (line 439 didn't jump to line 442, because the condition on line 439 was never false)

440 static_kwargs["ingest_date"] = SimpleQuery.Select 

441 else: 

442 assert isinstance(ingestDate, Timespan) 

443 # Timespan bounds are astropy Time (usually in TAI) while ingest_date is 

444 # a TIMESTAMP, so convert the values to Python datetime for sqlalchemy. 

445 if ingestDate.isEmpty(): 

446 raise RuntimeError("Empty timespan constraint provided for ingest_date.") 

447 if ingestDate.begin is not None: 

448 begin = ingestDate.begin.utc.datetime # type: ignore 

449 query.where.append(self._static.dataset.columns.ingest_date >= begin) 

450 if ingestDate.end is not None: 

451 end = ingestDate.end.utc.datetime # type: ignore 

452 query.where.append(self._static.dataset.columns.ingest_date < end) 

453 # If we need the static table, join it in via dataset_id and 

454 # dataset_type_id 

455 if need_static_table: 

456 query.join( 

457 self._static.dataset, 

458 onclause=(dataset_id_col == self._static.dataset.columns.id), 

459 **static_kwargs, 

460 ) 

461 # Also constrain dataset_type_id in static table in case that helps 

462 # generate a better plan. 

463 # We could also include this in the JOIN ON clause, but my guess is 

464 # that that's a good idea IFF it's in the foreign key, and right 

465 # now it isn't. 

466 query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id) 

467 return query 

468 

469 def getDataId(self, id: DatasetId) -> DataCoordinate: 

470 """Return DataId for a dataset. 

471 

472 Parameters 

473 ---------- 

474 id : `DatasetId` 

475 Unique dataset identifier. 

476 

477 Returns 

478 ------- 

479 dataId : `DataCoordinate` 

480 DataId for the dataset. 

481 """ 

482 # This query could return multiple rows (one for each tagged collection 

483 # the dataset is in, plus one for its run collection), and we don't 

484 # care which of those we get. 

485 sql = ( 

486 self._tags.select() 

487 .where( 

488 sqlalchemy.sql.and_( 

489 self._tags.columns.dataset_id == id, 

490 self._tags.columns.dataset_type_id == self._dataset_type_id, 

491 ) 

492 ) 

493 .limit(1) 

494 ) 

495 row = self._db.query(sql).mappings().fetchone() 

496 assert row is not None, "Should be guaranteed by caller and foreign key constraints." 

497 return DataCoordinate.standardize( 

498 {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required}, 

499 graph=self.datasetType.dimensions, 

500 ) 

501 

502 

503class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage): 

504 """Implementation of ByDimensionsDatasetRecordStorage which uses integer 

505 auto-incremented column for dataset IDs. 

506 """ 

507 

508 def insert( 

509 self, 

510 run: RunRecord, 

511 dataIds: Iterable[DataCoordinate], 

512 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

513 ) -> Iterator[DatasetRef]: 

514 # Docstring inherited from DatasetRecordStorage. 

515 

516 # We only support UNIQUE mode for integer dataset IDs 

517 if idMode != DatasetIdGenEnum.UNIQUE: 517 ↛ 518 (line 517 didn't jump to line 518, because the condition on line 517 was never true)

518 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.") 

519 

520 # Transform a possibly-single-pass iterable into a list. 

521 dataIdList = list(dataIds) 

522 yield from self._insert(run, dataIdList) 

523 

524 def import_( 

525 self, 

526 run: RunRecord, 

527 datasets: Iterable[DatasetRef], 

528 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

529 reuseIds: bool = False, 

530 ) -> Iterator[DatasetRef]: 

531 # Docstring inherited from DatasetRecordStorage. 

532 

533 # We only support UNIQUE mode for integer dataset IDs 

534 if idGenerationMode != DatasetIdGenEnum.UNIQUE: 534 ↛ 535 (line 534 didn't jump to line 535, because the condition on line 534 was never true)

535 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.") 

536 

537 # Make a list of dataIds and optionally dataset IDs. 

538 dataIdList: List[DataCoordinate] = [] 

539 datasetIdList: List[int] = [] 

540 for dataset in datasets: 

541 dataIdList.append(dataset.dataId) 

542 

543 # We only accept integer dataset IDs, but also allow None. 

544 datasetId = dataset.id 

545 if datasetId is None: 545 ↛ 547 (line 545 didn't jump to line 547, because the condition on line 545 was never true)

546 # if reuseIds is set then all IDs must be known 

547 if reuseIds: 

548 raise TypeError("All dataset IDs must be known if `reuseIds` is set") 

549 elif isinstance(datasetId, int): 549 ↛ 553 (line 549 didn't jump to line 553, because the condition on line 549 was never false)

550 if reuseIds: 

551 datasetIdList.append(datasetId) 

552 else: 

553 raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}") 

554 

555 yield from self._insert(run, dataIdList, datasetIdList) 

556 

557 def _insert( 

558 self, run: RunRecord, dataIdList: List[DataCoordinate], datasetIdList: Optional[List[int]] = None 

559 ) -> Iterator[DatasetRef]: 

560 """Common part of implementation of `insert` and `import_` methods.""" 

561 

562 # Remember any governor dimension values we see. 

563 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

564 for dataId in dataIdList: 

565 governorValues.update_extract(dataId) 

566 

567 staticRow = { 

568 "dataset_type_id": self._dataset_type_id, 

569 self._runKeyColumn: run.key, 

570 } 

571 with self._db.transaction(): 

572 # Insert into the static dataset table, generating autoincrement 

573 # dataset_id values. 

574 if datasetIdList: 

575 # reuse existing IDs 

576 rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList] 

577 self._db.insert(self._static.dataset, *rows) 

578 else: 

579 # use auto-incremented IDs 

580 datasetIdList = self._db.insert( 

581 self._static.dataset, *([staticRow] * len(dataIdList)), returnIds=True 

582 ) 

583 assert datasetIdList is not None 

584 # Update the summary tables for this collection in case this is the 

585 # first time this dataset type or these governor values will be 

586 # inserted there. 

587 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues) 

588 # Combine the generated dataset_id values and data ID fields to 

589 # form rows to be inserted into the tags table. 

590 protoTagsRow = { 

591 "dataset_type_id": self._dataset_type_id, 

592 self._collections.getCollectionForeignKeyName(): run.key, 

593 } 

594 tagsRows = [ 

595 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

596 for dataId, dataset_id in zip(dataIdList, datasetIdList) 

597 ] 

598 # Insert those rows into the tags table. This is where we'll 

599 # get any unique constraint violations. 

600 self._db.insert(self._tags, *tagsRows) 

601 

602 for dataId, datasetId in zip(dataIdList, datasetIdList): 

603 yield DatasetRef( 

604 datasetType=self.datasetType, 

605 dataId=dataId, 

606 id=datasetId, 

607 run=run.name, 

608 ) 

609 

610 

611class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage): 

612 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for 

613 dataset IDs. 

614 """ 

615 

616 NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f") 

617 """Namespace UUID used for UUID5 generation. Do not change. This was 

618 produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`. 

619 """ 

620 

621 def insert( 

622 self, 

623 run: RunRecord, 

624 dataIds: Iterable[DataCoordinate], 

625 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

626 ) -> Iterator[DatasetRef]: 

627 # Docstring inherited from DatasetRecordStorage. 

628 

629 # Remember any governor dimension values we see. 

630 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

631 

632 # Iterate over data IDs, transforming a possibly-single-pass iterable 

633 # into a list. 

634 dataIdList = [] 

635 rows = [] 

636 for dataId in dataIds: 

637 dataIdList.append(dataId) 

638 rows.append( 

639 { 

640 "id": self._makeDatasetId(run, dataId, idMode), 

641 "dataset_type_id": self._dataset_type_id, 

642 self._runKeyColumn: run.key, 

643 } 

644 ) 

645 governorValues.update_extract(dataId) 

646 

647 with self._db.transaction(): 

648 # Insert into the static dataset table. 

649 self._db.insert(self._static.dataset, *rows) 

650 # Update the summary tables for this collection in case this is the 

651 # first time this dataset type or these governor values will be 

652 # inserted there. 

653 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues) 

654 # Combine the generated dataset_id values and data ID fields to 

655 # form rows to be inserted into the tags table. 

656 protoTagsRow = { 

657 "dataset_type_id": self._dataset_type_id, 

658 self._collections.getCollectionForeignKeyName(): run.key, 

659 } 

660 tagsRows = [ 

661 dict(protoTagsRow, dataset_id=row["id"], **dataId.byName()) 

662 for dataId, row in zip(dataIdList, rows) 

663 ] 

664 # Insert those rows into the tags table. 

665 self._db.insert(self._tags, *tagsRows) 

666 

667 for dataId, row in zip(dataIdList, rows): 

668 yield DatasetRef( 

669 datasetType=self.datasetType, 

670 dataId=dataId, 

671 id=row["id"], 

672 run=run.name, 

673 ) 

674 

675 def import_( 

676 self, 

677 run: RunRecord, 

678 datasets: Iterable[DatasetRef], 

679 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

680 reuseIds: bool = False, 

681 ) -> Iterator[DatasetRef]: 

682 # Docstring inherited from DatasetRecordStorage. 

683 

684 # Remember any governor dimension values we see. 

685 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

686 

687 # Iterate over data IDs, transforming a possibly-single-pass iterable 

688 # into a list. 

689 dataIds = {} 

690 for dataset in datasets: 

691 # Ignore unknown ID types; normally all IDs have the same type, but 

692 # this code supports mixed types or missing IDs. 

693 datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None 

694 if datasetId is None: 

695 datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode) 

696 dataIds[datasetId] = dataset.dataId 

697 governorValues.update_extract(dataset.dataId) 

698 

699 with self._db.session() as session: 

700 

701 # insert all new rows into a temporary table 

702 tableSpec = makeTagTableSpec( 

703 self.datasetType, type(self._collections), ddl.GUID, constraints=False 

704 ) 

705 tmp_tags = session.makeTemporaryTable(tableSpec) 

706 

707 collFkName = self._collections.getCollectionForeignKeyName() 

708 protoTagsRow = { 

709 "dataset_type_id": self._dataset_type_id, 

710 collFkName: run.key, 

711 } 

712 tmpRows = [ 

713 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

714 for dataset_id, dataId in dataIds.items() 

715 ] 

716 

717 with self._db.transaction(): 

718 

719 # store all incoming data in a temporary table 

720 self._db.insert(tmp_tags, *tmpRows) 

721 

722 # There are some checks that we want to make for consistency 

723 # of the new datasets with existing ones. 

724 self._validateImport(tmp_tags, run) 

725 

726 # Before we merge the temporary table into dataset/tags we need to 

727 # drop datasets which are already there (and do not conflict). 

728 self._db.deleteWhere( 

729 tmp_tags, 

730 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)), 

731 ) 

732 

733 # Copy it into the dataset table; we need to re-label some columns. 

734 self._db.insert( 

735 self._static.dataset, 

736 select=sqlalchemy.sql.select( 

737 tmp_tags.columns.dataset_id.label("id"), 

738 tmp_tags.columns.dataset_type_id, 

739 tmp_tags.columns[collFkName].label(self._runKeyColumn), 

740 ), 

741 ) 

742 

743 # Update the summary tables for this collection in case this 

744 # is the first time this dataset type or these governor values 

745 # will be inserted there. 

746 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues) 

747 

748 # Copy it into tags table. 

749 self._db.insert(self._tags, select=tmp_tags.select()) 

750 

751 # Return refs in the same order as in the input list. 

752 for dataset_id, dataId in dataIds.items(): 

753 yield DatasetRef( 

754 datasetType=self.datasetType, 

755 id=dataset_id, 

756 dataId=dataId, 

757 run=run.name, 

758 ) 

759 

760 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None: 

761 """Validate imported refs against existing datasets. 

762 

763 Parameters 

764 ---------- 

765 tmp_tags : `sqlalchemy.schema.Table` 

766 Temporary table with new datasets and the same schema as tags 

767 table. 

768 run : `RunRecord` 

769 The record object describing the `~CollectionType.RUN` collection. 

770 

771 Raises 

772 ------ 

773 ConflictingDefinitionError 

774 Raise if new datasets conflict with existing ones. 

775 """ 

776 dataset = self._static.dataset 

777 tags = self._tags 

778 collFkName = self._collections.getCollectionForeignKeyName() 

779 

780 # Check that existing datasets have the same dataset type and 

781 # run. 

782 query = ( 

783 sqlalchemy.sql.select( 

784 dataset.columns.id.label("dataset_id"), 

785 dataset.columns.dataset_type_id.label("dataset_type_id"), 

786 tmp_tags.columns.dataset_type_id.label("new dataset_type_id"), 

787 dataset.columns[self._runKeyColumn].label("run"), 

788 tmp_tags.columns[collFkName].label("new run"), 

789 ) 

790 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id)) 

791 .where( 

792 sqlalchemy.sql.or_( 

793 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

794 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName], 

795 ) 

796 ) 

797 ) 

798 result = self._db.query(query) 

799 if (row := result.first()) is not None: 

800 # Only include the first one in the exception message 

801 raise ConflictingDefinitionError( 

802 f"Existing dataset type or run do not match new dataset: {row._asdict()}" 

803 ) 

804 

805 # Check that matching dataset in tags table has the same DataId. 

806 query = ( 

807 sqlalchemy.sql.select( 

808 tags.columns.dataset_id, 

809 tags.columns.dataset_type_id.label("type_id"), 

810 tmp_tags.columns.dataset_type_id.label("new type_id"), 

811 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

812 *[ 

813 tmp_tags.columns[dim].label(f"new {dim}") 

814 for dim in self.datasetType.dimensions.required.names 

815 ], 

816 ) 

817 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id)) 

818 .where( 

819 sqlalchemy.sql.or_( 

820 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

821 *[ 

822 tags.columns[dim] != tmp_tags.columns[dim] 

823 for dim in self.datasetType.dimensions.required.names 

824 ], 

825 ) 

826 ) 

827 ) 

828 result = self._db.query(query) 

829 if (row := result.first()) is not None: 

830 # Only include the first one in the exception message 

831 raise ConflictingDefinitionError( 

832 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}" 

833 ) 

834 

835 # Check that matching run+dataId have the same dataset ID. 

836 query = ( 

837 sqlalchemy.sql.select( 

838 tags.columns.dataset_type_id.label("dataset_type_id"), 

839 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

840 tags.columns.dataset_id, 

841 tmp_tags.columns.dataset_id.label("new dataset_id"), 

842 tags.columns[collFkName], 

843 tmp_tags.columns[collFkName].label(f"new {collFkName}"), 

844 ) 

845 .select_from( 

846 tags.join( 

847 tmp_tags, 

848 sqlalchemy.sql.and_( 

849 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id, 

850 tags.columns[collFkName] == tmp_tags.columns[collFkName], 

851 *[ 

852 tags.columns[dim] == tmp_tags.columns[dim] 

853 for dim in self.datasetType.dimensions.required.names 

854 ], 

855 ), 

856 ) 

857 ) 

858 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id) 

859 ) 

860 result = self._db.query(query) 

861 if (row := result.first()) is not None: 

862 # only include the first one in the exception message 

863 raise ConflictingDefinitionError( 

864 f"Existing dataset type and dataId does not match new dataset: {row._asdict()}" 

865 ) 

866 

867 def _makeDatasetId( 

868 self, run: RunRecord, dataId: DataCoordinate, idGenerationMode: DatasetIdGenEnum 

869 ) -> uuid.UUID: 

870 """Generate dataset ID for a dataset. 

871 

872 Parameters 

873 ---------- 

874 run : `RunRecord` 

875 The record object describing the RUN collection for the dataset. 

876 dataId : `DataCoordinate` 

877 Expanded data ID for the dataset. 

878 idGenerationMode : `DatasetIdGenEnum` 

879 ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random 

880 UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a 

881 deterministic UUID5-type ID based on a dataset type name and 

882 ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a 

883 deterministic UUID5-type ID based on a dataset type name, run 

884 collection name, and ``dataId``. 

885 

886 Returns 

887 ------- 

888 datasetId : `uuid.UUID` 

889 Dataset identifier. 

890 """ 

891 if idGenerationMode is DatasetIdGenEnum.UNIQUE: 

892 return uuid.uuid4() 

893 else: 

894 # WARNING: If you modify this code make sure that the order of 

895 # items in the `items` list below never changes. 

896 items: List[Tuple[str, str]] = [] 

897 if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE: 

898 items = [ 

899 ("dataset_type", self.datasetType.name), 

900 ] 

901 elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN: 901 ↛ 907 (line 901 didn't jump to line 907, because the condition on line 901 was never false)

902 items = [ 

903 ("dataset_type", self.datasetType.name), 

904 ("run", run.name), 

905 ] 

906 else: 

907 raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}") 

908 

909 for name, value in sorted(dataId.byName().items()): 

910 items.append((name, str(value))) 

911 data = ",".join(f"{key}={value}" for key, value in items) 

912 return uuid.uuid5(self.NS_UUID, data)
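
# A minimal, self-contained sketch of the DATAID_TYPE_RUN branch of
# `_makeDatasetId` above. The dataset type name, run name, and data ID values
# below are hypothetical; only the hashing recipe (dataset type and run
# followed by the sorted data ID items, hashed with uuid5 under NS_UUID)
# mirrors the method.


def _example_dataid_type_run_id() -> uuid.UUID:
    ns_uuid = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    # As warned in the source: the order of these items must never change, or
    # previously generated IDs would stop being reproducible.
    items = [
        ("dataset_type", "calexp"),  # hypothetical dataset type name
        ("run", "HSC/runs/example"),  # hypothetical RUN collection name
    ]
    data_id = {"instrument": "HSC", "detector": 42, "visit": 12345}  # hypothetical
    for name, value in sorted(data_id.items()):
        items.append((name, str(value)))
    data = ",".join(f"{key}={value}" for key, value in items)
    return uuid.uuid5(ns_uuid, data)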