Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 88%

304 statements  

1from __future__ import annotations 

2 

3__all__ = ("ByDimensionsDatasetRecordStorage",) 

4 

5import uuid 

6from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple 

7 

8import sqlalchemy 

9from lsst.daf.butler import ( 

10 CollectionType, 

11 DataCoordinate, 

12 DataCoordinateSet, 

13 DatasetId, 

14 DatasetRef, 

15 DatasetType, 

16 SimpleQuery, 

17 Timespan, 

18 ddl, 

19) 

20from lsst.daf.butler.registry import ConflictingDefinitionError, UnsupportedIdGeneratorError 

21from lsst.daf.butler.registry.interfaces import DatasetIdGenEnum, DatasetRecordStorage 

22 

23from ...summaries import GovernorDimensionRestriction 

24from .tables import makeTagTableSpec 

25 

26if TYPE_CHECKING: 26 ↛ 27 (line 26 didn't jump to line 27, because the condition on line 26 was never true)

27 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord 

28 from .summaries import CollectionSummaryManager 

29 from .tables import StaticDatasetTablesTuple 

30 

31 

32class ByDimensionsDatasetRecordStorage(DatasetRecordStorage): 

33 """Dataset record storage implementation paired with 

34 `ByDimensionsDatasetRecordStorageManager`; see that class for more 

35 information. 

36 

37 Instances of this class should never be constructed directly; use 

38 `DatasetRecordStorageManager.register` instead. 

39 """ 

40 

41 def __init__( 

42 self, 

43 *, 

44 datasetType: DatasetType, 

45 db: Database, 

46 dataset_type_id: int, 

47 collections: CollectionManager, 

48 static: StaticDatasetTablesTuple, 

49 summaries: CollectionSummaryManager, 

50 tags: sqlalchemy.schema.Table, 

51 calibs: Optional[sqlalchemy.schema.Table], 

52 ): 

53 super().__init__(datasetType=datasetType) 

54 self._dataset_type_id = dataset_type_id 

55 self._db = db 

56 self._collections = collections 

57 self._static = static 

58 self._summaries = summaries 

59 self._tags = tags 

60 self._calibs = calibs 

61 self._runKeyColumn = collections.getRunForeignKeyName() 

62 

63 def find( 

64 self, collection: CollectionRecord, dataId: DataCoordinate, timespan: Optional[Timespan] = None 

65 ) -> Optional[DatasetRef]: 

66 # Docstring inherited from DatasetRecordStorage. 

67 assert dataId.graph == self.datasetType.dimensions 

68 if collection.type is CollectionType.CALIBRATION and timespan is None: 68 ↛ 69 (line 68 didn't jump to line 69, because the condition on line 68 was never true)

69 raise TypeError( 

70 f"Cannot search for dataset in CALIBRATION collection {collection.name} " 

71 f"without an input timespan." 

72 ) 

73 sql = self.select( 

74 collection, dataId=dataId, id=SimpleQuery.Select, run=SimpleQuery.Select, timespan=timespan 

75 ) 

76 sql = sql.combine() 

77 results = self._db.query(sql) 

78 row = results.fetchone() 

79 if row is None: 

80 return None 

81 if collection.type is CollectionType.CALIBRATION: 

82 # For temporal calibration lookups (only!) our invariants do not 

83 # guarantee that the number of result rows is <= 1. 

84 # They would if `select` constrained the given timespan to be 

85 # _contained_ by the validity range in the self._calibs table, 

86 # instead of simply _overlapping_ it, because we do guarantee that 

87 # the validity ranges are disjoint for a particular dataset type, 

88 # collection, and data ID. But using an overlap test and a check 

89 # for multiple result rows here allows us to provide a more useful 

90 # diagnostic, as well as allowing `select` to support more general 

91 # queries where multiple results are not an error. 

92 if results.fetchone() is not None: 

93 raise RuntimeError( 

94 f"Multiple matches found for calibration lookup in {collection.name} for " 

95 f"{self.datasetType.name} with {dataId} overlapping {timespan}. " 

96 ) 

97 return DatasetRef( 

98 datasetType=self.datasetType, 

99 dataId=dataId, 

100 id=row.id, 

101 run=self._collections[row._mapping[self._runKeyColumn]].name, 

102 ) 

103 

104 def delete(self, datasets: Iterable[DatasetRef]) -> None: 

105 # Docstring inherited from DatasetRecordStorage. 

106 # Only delete from common dataset table; ON DELETE foreign key clauses 

107 # will handle the rest. 

108 self._db.delete( 

109 self._static.dataset, 

110 ["id"], 

111 *[{"id": dataset.getCheckedId()} for dataset in datasets], 

112 ) 

113 

114 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

115 # Docstring inherited from DatasetRecordStorage. 

116 if collection.type is not CollectionType.TAGGED: 116 ↛ 117 (line 116 didn't jump to line 117, because the condition on line 116 was never true)

117 raise TypeError( 

118 f"Cannot associate into collection '{collection.name}' " 

119 f"of type {collection.type.name}; must be TAGGED." 

120 ) 

121 protoRow = { 

122 self._collections.getCollectionForeignKeyName(): collection.key, 

123 "dataset_type_id": self._dataset_type_id, 

124 } 

125 rows = [] 

126 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

127 for dataset in datasets: 

128 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

129 for dimension, value in dataset.dataId.items(): 

130 row[dimension.name] = value 

131 governorValues.update_extract(dataset.dataId) 

132 rows.append(row) 

133 # Update the summary tables for this collection in case this is the 

134 # first time this dataset type or these governor values will be 

135 # inserted there. 

136 self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues) 

137 # Update the tag table itself. 

138 self._db.replace(self._tags, *rows) 

139 

140 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

141 # Docstring inherited from DatasetRecordStorage. 

142 if collection.type is not CollectionType.TAGGED: 142 ↛ 143 (line 142 didn't jump to line 143, because the condition on line 142 was never true)

143 raise TypeError( 

144 f"Cannot disassociate from collection '{collection.name}' " 

145 f"of type {collection.type.name}; must be TAGGED." 

146 ) 

147 rows = [ 

148 { 

149 "dataset_id": dataset.getCheckedId(), 

150 self._collections.getCollectionForeignKeyName(): collection.key, 

151 } 

152 for dataset in datasets 

153 ] 

154 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows) 

155 

156 def _buildCalibOverlapQuery( 

157 self, collection: CollectionRecord, dataIds: Optional[DataCoordinateSet], timespan: Timespan 

158 ) -> SimpleQuery: 

159 assert self._calibs is not None 

160 # Start by building a SELECT query for any rows that would overlap 

161 # this one. 

162 query = SimpleQuery() 

163 query.join(self._calibs) 

164 # Add a WHERE clause matching the dataset type and collection. 

165 query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id) 

166 query.where.append( 

167 self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key 

168 ) 

169 # Add a WHERE clause matching any of the given data IDs. 

170 if dataIds is not None: 

171 dataIds.constrain( 

172 query, 

173 lambda name: self._calibs.columns[name], # type: ignore 

174 ) 

175 # Add WHERE clause for timespan overlaps. 

176 TimespanReprClass = self._db.getTimespanRepresentation() 

177 query.where.append( 

178 TimespanReprClass.fromSelectable(self._calibs).overlaps(TimespanReprClass.fromLiteral(timespan)) 

179 ) 

180 return query 

181 

182 def certify( 

183 self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan 

184 ) -> None: 

185 # Docstring inherited from DatasetRecordStorage. 

186 if self._calibs is None: 186 ↛ 187 (line 186 didn't jump to line 187, because the condition on line 186 was never true)

187 raise TypeError( 

188 f"Cannot certify datasets of type {self.datasetType.name}, for which " 

189 f"DatasetType.isCalibration() is False." 

190 ) 

191 if collection.type is not CollectionType.CALIBRATION: 191 ↛ 192 (line 191 didn't jump to line 192, because the condition on line 191 was never true)

192 raise TypeError( 

193 f"Cannot certify into collection '{collection.name}' " 

194 f"of type {collection.type.name}; must be CALIBRATION." 

195 ) 

196 TimespanReprClass = self._db.getTimespanRepresentation() 

197 protoRow = { 

198 self._collections.getCollectionForeignKeyName(): collection.key, 

199 "dataset_type_id": self._dataset_type_id, 

200 } 

201 rows = [] 

202 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

203 dataIds: Optional[Set[DataCoordinate]] = ( 

204 set() if not TimespanReprClass.hasExclusionConstraint() else None 

205 ) 

206 for dataset in datasets: 

207 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

208 for dimension, value in dataset.dataId.items(): 

209 row[dimension.name] = value 

210 TimespanReprClass.update(timespan, result=row) 

211 governorValues.update_extract(dataset.dataId) 

212 rows.append(row) 

213 if dataIds is not None: 213 ↛ 206 (line 213 didn't jump to line 206, because the condition on line 213 was never false)

214 dataIds.add(dataset.dataId) 

215 # Update the summary tables for this collection in case this is the 

216 # first time this dataset type or these governor values will be 

217 # inserted there. 

218 self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues) 

219 # Update the association table itself. 

220 if TimespanReprClass.hasExclusionConstraint(): 220 ↛ 223 (line 220 didn't jump to line 223, because the condition on line 220 was never true)

221 # Rely on database constraint to enforce invariants; we just 

222 # reraise the exception for consistency across DB engines. 

223 try: 

224 self._db.insert(self._calibs, *rows) 

225 except sqlalchemy.exc.IntegrityError as err: 

226 raise ConflictingDefinitionError( 

227 f"Validity range conflict certifying datasets of type {self.datasetType.name} " 

228 f"into {collection.name} for range [{timespan.begin}, {timespan.end})." 

229 ) from err 

230 else: 

231 # Have to implement exclusion constraint ourselves. 

232 # Start by building a SELECT query for any rows that would overlap 

233 # this one. 

234 query = self._buildCalibOverlapQuery( 

235 collection, 

236 DataCoordinateSet(dataIds, graph=self.datasetType.dimensions), # type: ignore 

237 timespan, 

238 ) 

239 query.columns.append(sqlalchemy.sql.func.count()) 

240 sql = query.combine() 

241 # Acquire a table lock to ensure there are no concurrent writes 

242 # that could invalidate our checking before we finish the inserts. We

243 # use a SAVEPOINT in case there is an outer transaction that a 

244 # failure here should not roll back. 

245 with self._db.transaction(lock=[self._calibs], savepoint=True): 

246 # Run the check SELECT query. 

247 conflicting = self._db.query(sql).scalar() 

248 if conflicting > 0: 

249 raise ConflictingDefinitionError( 

250 f"{conflicting} validity range conflicts certifying datasets of type " 

251 f"{self.datasetType.name} into {collection.name} for range " 

252 f"[{timespan.begin}, {timespan.end})." 

253 ) 

254 # Proceed with the insert. 

255 self._db.insert(self._calibs, *rows) 

256 

257 def decertify( 

258 self, 

259 collection: CollectionRecord, 

260 timespan: Timespan, 

261 *, 

262 dataIds: Optional[Iterable[DataCoordinate]] = None, 

263 ) -> None: 

264 # Docstring inherited from DatasetRecordStorage. 

265 if self._calibs is None: 265 ↛ 266 (line 265 didn't jump to line 266, because the condition on line 265 was never true)

266 raise TypeError( 

267 f"Cannot decertify datasets of type {self.datasetType.name}, for which " 

268 f"DatasetType.isCalibration() is False." 

269 ) 

270 if collection.type is not CollectionType.CALIBRATION: 270 ↛ 271 (line 270 didn't jump to line 271, because the condition on line 270 was never true)

271 raise TypeError( 

272 f"Cannot decertify from collection '{collection.name}' " 

273 f"of type {collection.type.name}; must be CALIBRATION." 

274 ) 

275 TimespanReprClass = self._db.getTimespanRepresentation() 

276 # Construct a SELECT query to find all rows that overlap our inputs. 

277 dataIdSet: Optional[DataCoordinateSet] 

278 if dataIds is not None: 

279 dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions) 

280 else: 

281 dataIdSet = None 

282 query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan) 

283 query.columns.extend(self._calibs.columns) 

284 sql = query.combine() 

285 # Set up collections to populate with the rows we'll want to modify. 

286 # The insert rows will have the same values for collection and 

287 # dataset type. 

288 protoInsertRow = { 

289 self._collections.getCollectionForeignKeyName(): collection.key, 

290 "dataset_type_id": self._dataset_type_id, 

291 } 

292 rowsToDelete = [] 

293 rowsToInsert = [] 

294 # Acquire a table lock to ensure there are no concurrent writes 

295 # between the SELECT and the DELETE and INSERT queries based on it. 

296 with self._db.transaction(lock=[self._calibs], savepoint=True): 

297 for row in self._db.query(sql).mappings(): 

298 rowsToDelete.append({"id": row["id"]}) 

299 # Construct the insert row(s) by copying the prototype row, 

300 # then adding the dimension column values, then adding what's 

301 # left of the timespan from that row after we subtract the 

302 # given timespan. 
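# (Illustrative note, not in the original source: decertifying a timespan
# [t2, t3) from a stored row whose validity range is [t1, t4) deletes that
# row and re-inserts the leftover pieces [t1, t2) and [t3, t4); the
# difference of two timespans can be zero, one, or two such pieces.)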

303 newInsertRow = protoInsertRow.copy() 

304 newInsertRow["dataset_id"] = row["dataset_id"] 

305 for name in self.datasetType.dimensions.required.names: 

306 newInsertRow[name] = row[name] 

307 rowTimespan = TimespanReprClass.extract(row) 

308 assert rowTimespan is not None, "Field should have a NOT NULL constraint." 

309 for diffTimespan in rowTimespan.difference(timespan): 

310 rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy())) 

311 # Run the DELETE and INSERT queries. 

312 self._db.delete(self._calibs, ["id"], *rowsToDelete) 

313 self._db.insert(self._calibs, *rowsToInsert) 

314 

315 def select( 

316 self, 

317 *collections: CollectionRecord, 

318 dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select, 

319 id: SimpleQuery.Select.Or[Optional[int]] = SimpleQuery.Select, 

320 run: SimpleQuery.Select.Or[None] = SimpleQuery.Select, 

321 timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select, 

322 ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None, 

323 ) -> SimpleQuery: 

324 # Docstring inherited from DatasetRecordStorage. 

325 collection_types = {collection.type for collection in collections} 

326 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened." 

327 # 

328 # There are two tables in play here: 

329 # 

330 # - the static dataset table (with the dataset ID, dataset type ID, 

331 # run ID/name, and ingest date); 

332 # 

333 # - the dynamic tags/calibs table (with the dataset ID, dataset type 

334 # ID, collection ID/name, data ID, and possibly validity

335 # range). 

336 # 

337 # That means that we might want to return a query against either table 

338 # or a JOIN of both, depending on which quantities the caller wants. 

339 # But this method is documented/typed such that ``dataId`` is never 

340 # `None` - i.e. we always constrain or retrieve the data ID. That

341 # means we'll always include the tags/calibs table and join in the 

342 # static dataset table only if we need things from it that we can't get 

343 # from the tags/calibs table. 

344 # 

345 # Note that it's important that we include a WHERE constraint on both 

346 # tables for any column (e.g. dataset_type_id) that is in both when 

347 # it's given explicitly; not doing so can prevent the query planner from

348 # using very important indexes. At present, we don't include those 

349 # redundant columns in the JOIN ON expression, however, because the 

350 # FOREIGN KEY (and its index) are defined only on dataset_id. 

351 # 

352 # We'll start with an empty SimpleQuery, and accumulate kwargs to pass 

353 # to its `join` method when we bring in the tags/calibs table. 

354 query = SimpleQuery() 

355 # We get the data ID or constrain it in the tags/calibs table, but 

356 # that's multiple columns, not one, so we need to transform the one 

357 # Select.Or argument into a dictionary of them. 

358 kwargs: Dict[str, Any] 

359 if dataId is SimpleQuery.Select: 

360 kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required} 

361 else: 

362 kwargs = dict(dataId.byName()) 

363 # We always constrain (never retrieve) the dataset type in at least the 

364 # tags/calibs table. 

365 kwargs["dataset_type_id"] = self._dataset_type_id 

366 # Join in the tags or calibs table, turning those 'kwargs' entries into 

367 # WHERE constraints or SELECT columns as appropriate. 

368 if collection_types == {CollectionType.CALIBRATION}: 

369 assert ( 

370 self._calibs is not None 

371 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection." 

372 TimespanReprClass = self._db.getTimespanRepresentation() 

373 # Add the timespan column(s) to the result columns, or constrain 

374 # the timespan via an overlap condition. 

375 if timespan is SimpleQuery.Select: 

376 kwargs.update({k: SimpleQuery.Select for k in TimespanReprClass.getFieldNames()}) 

377 elif timespan is not None: 377 ↛ 383 (line 377 didn't jump to line 383, because the condition on line 377 was never false)

378 query.where.append( 

379 TimespanReprClass.fromSelectable(self._calibs).overlaps( 

380 TimespanReprClass.fromLiteral(timespan) 

381 ) 

382 ) 

383 query.join(self._calibs, **kwargs) 

384 dataset_id_col = self._calibs.columns.dataset_id 

385 collection_col = self._calibs.columns[self._collections.getCollectionForeignKeyName()] 

386 elif CollectionType.CALIBRATION not in collection_types: 386 ↛ 391 (line 386 didn't jump to line 391, because the condition on line 386 was never false)

387 query.join(self._tags, **kwargs) 

388 dataset_id_col = self._tags.columns.dataset_id 

389 collection_col = self._tags.columns[self._collections.getCollectionForeignKeyName()] 

390 else: 

391 raise TypeError( 

392 "Cannot query for CALIBRATION collections in the same " 

393 "subquery as other kinds of collections." 

394 ) 

395 # We always constrain (never retrieve) the collection(s) in the 

396 # tags/calibs table. 

397 if len(collections) == 1: 

398 query.where.append(collection_col == collections[0].key) 

399 elif len(collections) == 0: 

400 # We support the case where there are no collections as a way to 

401 # generate a valid SQL query that can't yield results. This should 

402 # never get executed, but lots of downstream code will still try 

403 # to access the SQLAlchemy objects representing the columns in the 

404 # subquery. That's not ideal, but it'd take a lot of refactoring to

405 # fix it. 

406 query.where.append(sqlalchemy.sql.literal(False)) 

407 else: 

408 query.where.append(collection_col.in_([collection.key for collection in collections])) 

409 # We can always get the dataset_id from the tags/calibs table or 

410 # constrain it there. Can't use kwargs for that because we need to 

411 # alias it to 'id'. 

412 if id is SimpleQuery.Select: 

413 query.columns.append(dataset_id_col.label("id")) 

414 elif id is not None: 414 ↛ 415 (line 414 didn't jump to line 415, because the condition on line 414 was never true)

415 query.where.append(dataset_id_col == id) 

416 # It's possible we now have everything we need, from just the 

417 # tags/calibs table. The things we might need to get from the static 

418 # dataset table are the run key and the ingest date. 

419 need_static_table = False 

420 static_kwargs: Dict[str, Any] = {} 

421 if run is not None: 

422 assert run is SimpleQuery.Select, "To constrain the run name, pass a RunRecord as a collection." 

423 if len(collections) == 1 and collections[0].type is CollectionType.RUN: 

424 # If we are searching exactly one RUN collection, we 

425 # know that if we find the dataset in that collection, 

426 # then that's the dataset's run; we don't need to

427 # query for it. 

428 query.columns.append(sqlalchemy.sql.literal(collections[0].key).label(self._runKeyColumn)) 

429 else: 

430 static_kwargs[self._runKeyColumn] = SimpleQuery.Select 

431 need_static_table = True 

432 # Ingest date can only come from the static table. 

433 if ingestDate is not None: 

434 need_static_table = True 

435 if ingestDate is SimpleQuery.Select: 435 ↛ 438 (line 435 didn't jump to line 438, because the condition on line 435 was never false)

436 static_kwargs["ingest_date"] = SimpleQuery.Select 

437 else: 

438 assert isinstance(ingestDate, Timespan) 

439 # Timespan is astropy Time (usually in TAI) and ingest_date is 

440 # TIMESTAMP, so convert values to Python datetime for sqlalchemy.

441 if ingestDate.isEmpty(): 

442 raise RuntimeError("Empty timespan constraint provided for ingest_date.") 

443 if ingestDate.begin is not None: 

444 begin = ingestDate.begin.utc.datetime # type: ignore 

445 query.where.append(self._static.dataset.columns.ingest_date >= begin) 

446 if ingestDate.end is not None: 

447 end = ingestDate.end.utc.datetime # type: ignore 

448 query.where.append(self._static.dataset.columns.ingest_date < end) 

449 # If we need the static table, join it in via dataset_id and 

450 # dataset_type_id 

451 if need_static_table: 

452 query.join( 

453 self._static.dataset, 

454 onclause=(dataset_id_col == self._static.dataset.columns.id), 

455 **static_kwargs, 

456 ) 

457 # Also constrain dataset_type_id in static table in case that helps 

458 # generate a better plan. 

459 # We could also include this in the JOIN ON clause, but my guess is 

460 # that that's a good idea IFF it's in the foreign key, and right 

461 # now it isn't. 

462 query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id) 

463 return query 

464 

465 def getDataId(self, id: DatasetId) -> DataCoordinate: 

466 """Return DataId for a dataset. 

467 

468 Parameters 

469 ---------- 

470 id : `DatasetId` 

471 Unique dataset identifier. 

472 

473 Returns 

474 ------- 

475 dataId : `DataCoordinate` 

476 DataId for the dataset. 

477 """ 

478 # This query could return multiple rows (one for each tagged collection 

479 # the dataset is in, plus one for its run collection), and we don't 

480 # care which of those we get. 

481 sql = ( 

482 self._tags.select() 

483 .where( 

484 sqlalchemy.sql.and_( 

485 self._tags.columns.dataset_id == id, 

486 self._tags.columns.dataset_type_id == self._dataset_type_id, 

487 ) 

488 ) 

489 .limit(1) 

490 ) 

491 row = self._db.query(sql).mappings().fetchone() 

492 assert row is not None, "Should be guaranteed by caller and foreign key constraints." 

493 return DataCoordinate.standardize( 

494 {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required}, 

495 graph=self.datasetType.dimensions, 

496 ) 

497 

498 

499class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage): 

500 """Implementation of ByDimensionsDatasetRecordStorage which uses integer 

501 auto-incremented column for dataset IDs. 

502 """ 

503 

504 def insert( 

505 self, 

506 run: RunRecord, 

507 dataIds: Iterable[DataCoordinate], 

508 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

509 ) -> Iterator[DatasetRef]: 

510 # Docstring inherited from DatasetRecordStorage. 

511 

512 # We only support UNIQUE mode for integer dataset IDs 

513 if idMode != DatasetIdGenEnum.UNIQUE: 513 ↛ 514 (line 513 didn't jump to line 514, because the condition on line 513 was never true)

514 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.") 

515 

516 # Transform a possibly-single-pass iterable into a list. 

517 dataIdList = list(dataIds) 

518 yield from self._insert(run, dataIdList) 

519 

520 def import_( 

521 self, 

522 run: RunRecord, 

523 datasets: Iterable[DatasetRef], 

524 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

525 reuseIds: bool = False, 

526 ) -> Iterator[DatasetRef]: 

527 # Docstring inherited from DatasetRecordStorage. 

528 

529 # We only support UNIQUE mode for integer dataset IDs 

530 if idGenerationMode != DatasetIdGenEnum.UNIQUE: 530 ↛ 531 (line 530 didn't jump to line 531, because the condition on line 530 was never true)

531 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.") 

532 

533 # Make a list of dataIds and optionally dataset IDs. 

534 dataIdList: List[DataCoordinate] = [] 

535 datasetIdList: List[int] = [] 

536 for dataset in datasets: 

537 dataIdList.append(dataset.dataId) 

538 

539 # We only accept integer dataset IDs, but also allow None. 

540 datasetId = dataset.id 

541 if datasetId is None: 541 ↛ 543 (line 541 didn't jump to line 543, because the condition on line 541 was never true)

542 # if reuseIds is set then all IDs must be known 

543 if reuseIds: 

544 raise TypeError("All dataset IDs must be known if `reuseIds` is set") 

545 elif isinstance(datasetId, int): 545 ↛ 549 (line 545 didn't jump to line 549, because the condition on line 545 was never false)

546 if reuseIds: 

547 datasetIdList.append(datasetId) 

548 else: 

549 raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}") 

550 

551 yield from self._insert(run, dataIdList, datasetIdList) 

552 

553 def _insert( 

554 self, run: RunRecord, dataIdList: List[DataCoordinate], datasetIdList: Optional[List[int]] = None 

555 ) -> Iterator[DatasetRef]: 

556 """Common part of implementation of `insert` and `import_` methods.""" 

557 

558 # Remember any governor dimension values we see. 

559 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

560 for dataId in dataIdList: 

561 governorValues.update_extract(dataId) 

562 

563 staticRow = { 

564 "dataset_type_id": self._dataset_type_id, 

565 self._runKeyColumn: run.key, 

566 } 

567 with self._db.transaction(): 

568 # Insert into the static dataset table, generating autoincrement 

569 # dataset_id values. 

570 if datasetIdList: 

571 # reuse existing IDs 

572 rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList] 

573 self._db.insert(self._static.dataset, *rows) 

574 else: 

575 # use auto-incremented IDs 

576 datasetIdList = self._db.insert( 

577 self._static.dataset, *([staticRow] * len(dataIdList)), returnIds=True 

578 ) 

579 assert datasetIdList is not None 

580 # Update the summary tables for this collection in case this is the 

581 # first time this dataset type or these governor values will be 

582 # inserted there. 

583 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues) 

584 # Combine the generated dataset_id values and data ID fields to 

585 # form rows to be inserted into the tags table. 

586 protoTagsRow = { 

587 "dataset_type_id": self._dataset_type_id, 

588 self._collections.getCollectionForeignKeyName(): run.key, 

589 } 

590 tagsRows = [ 

591 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

592 for dataId, dataset_id in zip(dataIdList, datasetIdList) 

593 ] 

594 # Insert those rows into the tags table. This is where we'll 

595 # get any unique constraint violations. 

596 self._db.insert(self._tags, *tagsRows) 

597 

598 for dataId, datasetId in zip(dataIdList, datasetIdList): 

599 yield DatasetRef( 

600 datasetType=self.datasetType, 

601 dataId=dataId, 

602 id=datasetId, 

603 run=run.name, 

604 ) 

605 

606 

607class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage): 

608 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for 

609 dataset IDs. 

610 """ 

611 

612 NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f") 

613 """Namespace UUID used for UUID5 generation. Do not change. This was 

614 produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`. 

615 """ 

616 

617 def insert( 

618 self, 

619 run: RunRecord, 

620 dataIds: Iterable[DataCoordinate], 

621 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

622 ) -> Iterator[DatasetRef]: 

623 # Docstring inherited from DatasetRecordStorage. 

624 

625 # Remember any governor dimension values we see. 

626 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

627 

628 # Iterate over data IDs, transforming a possibly-single-pass iterable 

629 # into a list. 

630 dataIdList = [] 

631 rows = [] 

632 for dataId in dataIds: 

633 dataIdList.append(dataId) 

634 rows.append( 

635 { 

636 "id": self._makeDatasetId(run, dataId, idMode), 

637 "dataset_type_id": self._dataset_type_id, 

638 self._runKeyColumn: run.key, 

639 } 

640 ) 

641 governorValues.update_extract(dataId) 

642 

643 with self._db.transaction(): 

644 # Insert into the static dataset table. 

645 self._db.insert(self._static.dataset, *rows) 

646 # Update the summary tables for this collection in case this is the 

647 # first time this dataset type or these governor values will be 

648 # inserted there. 

649 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues) 

650 # Combine the generated dataset_id values and data ID fields to 

651 # form rows to be inserted into the tags table. 

652 protoTagsRow = { 

653 "dataset_type_id": self._dataset_type_id, 

654 self._collections.getCollectionForeignKeyName(): run.key, 

655 } 

656 tagsRows = [ 

657 dict(protoTagsRow, dataset_id=row["id"], **dataId.byName()) 

658 for dataId, row in zip(dataIdList, rows) 

659 ] 

660 # Insert those rows into the tags table. 

661 self._db.insert(self._tags, *tagsRows) 

662 

663 for dataId, row in zip(dataIdList, rows): 

664 yield DatasetRef( 

665 datasetType=self.datasetType, 

666 dataId=dataId, 

667 id=row["id"], 

668 run=run.name, 

669 ) 

670 

671 def import_( 

672 self, 

673 run: RunRecord, 

674 datasets: Iterable[DatasetRef], 

675 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

676 reuseIds: bool = False, 

677 ) -> Iterator[DatasetRef]: 

678 # Docstring inherited from DatasetRecordStorage. 

679 

680 # Remember any governor dimension values we see. 

681 governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe) 

682 

683 # Iterate over data IDs, transforming a possibly-single-pass iterable 

684 # into a list. 

685 dataIds = {} 

686 for dataset in datasets: 

687 # Ignore unknown ID types; normally all IDs have the same type, but

688 # this code supports mixed types or missing IDs. 

689 datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None 

690 if datasetId is None: 

691 datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode) 

692 dataIds[datasetId] = dataset.dataId 

693 governorValues.update_extract(dataset.dataId) 

694 

695 with self._db.session() as session: 

696 

697 # insert all new rows into a temporary table 

698 tableSpec = makeTagTableSpec( 

699 self.datasetType, type(self._collections), ddl.GUID, constraints=False 

700 ) 

701 tmp_tags = session.makeTemporaryTable(tableSpec) 

702 

703 collFkName = self._collections.getCollectionForeignKeyName() 

704 protoTagsRow = { 

705 "dataset_type_id": self._dataset_type_id, 

706 collFkName: run.key, 

707 } 

708 tmpRows = [ 

709 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

710 for dataset_id, dataId in dataIds.items() 

711 ] 

712 

713 with self._db.transaction(): 

714 

715 # store all incoming data in a temporary table 

716 self._db.insert(tmp_tags, *tmpRows) 

717 

718 # There are some checks that we want to make for consistency 

719 # of the new datasets with existing ones. 

720 self._validateImport(tmp_tags, run) 

721 

722 # Before we merge temporary table into dataset/tags we need to 

723 # drop datasets which are already there (and do not conflict). 

724 self._db.deleteWhere( 

725 tmp_tags, 

726 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)), 

727 ) 

728 

729 # Copy it into dataset table, need to re-label some columns. 

730 self._db.insert( 

731 self._static.dataset, 

732 select=sqlalchemy.sql.select( 

733 tmp_tags.columns.dataset_id.label("id"), 

734 tmp_tags.columns.dataset_type_id, 

735 tmp_tags.columns[collFkName].label(self._runKeyColumn), 

736 ), 

737 ) 

738 

739 # Update the summary tables for this collection in case this 

740 # is the first time this dataset type or these governor values 

741 # will be inserted there. 

742 self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues) 

743 

744 # Copy it into tags table. 

745 self._db.insert(self._tags, select=tmp_tags.select()) 

746 

747 # Return refs in the same order as in the input list. 

748 for dataset_id, dataId in dataIds.items(): 

749 yield DatasetRef( 

750 datasetType=self.datasetType, 

751 id=dataset_id, 

752 dataId=dataId, 

753 run=run.name, 

754 ) 

755 

756 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None: 

757 """Validate imported refs against existing datasets. 

758 

759 Parameters 

760 ---------- 

761 tmp_tags : `sqlalchemy.schema.Table` 

762 Temporary table with new datasets and the same schema as tags 

763 table. 

764 run : `RunRecord` 

765 The record object describing the `~CollectionType.RUN` collection. 

766 

767 Raises 

768 ------ 

769 ConflictingDefinitionError 

770 Raised if new datasets conflict with existing ones.

771 """ 

772 dataset = self._static.dataset 

773 tags = self._tags 

774 collFkName = self._collections.getCollectionForeignKeyName() 

775 

776 # Check that existing datasets have the same dataset type and 

777 # run. 

778 query = ( 

779 sqlalchemy.sql.select( 

780 dataset.columns.id.label("dataset_id"), 

781 dataset.columns.dataset_type_id.label("dataset_type_id"), 

782 tmp_tags.columns.dataset_type_id.label("new dataset_type_id"), 

783 dataset.columns[self._runKeyColumn].label("run"), 

784 tmp_tags.columns[collFkName].label("new run"), 

785 ) 

786 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id)) 

787 .where( 

788 sqlalchemy.sql.or_( 

789 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

790 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName], 

791 ) 

792 ) 

793 ) 

794 result = self._db.query(query) 

795 if (row := result.first()) is not None: 

796 # Only include the first one in the exception message 

797 raise ConflictingDefinitionError( 

798 f"Existing dataset type or run do not match new dataset: {row._asdict()}" 

799 ) 

800 

801 # Check that matching dataset in tags table has the same DataId. 

802 query = ( 

803 sqlalchemy.sql.select( 

804 tags.columns.dataset_id, 

805 tags.columns.dataset_type_id.label("type_id"), 

806 tmp_tags.columns.dataset_type_id.label("new type_id"), 

807 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

808 *[ 

809 tmp_tags.columns[dim].label(f"new {dim}") 

810 for dim in self.datasetType.dimensions.required.names 

811 ], 

812 ) 

813 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id)) 

814 .where( 

815 sqlalchemy.sql.or_( 

816 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

817 *[ 

818 tags.columns[dim] != tmp_tags.columns[dim] 

819 for dim in self.datasetType.dimensions.required.names 

820 ], 

821 ) 

822 ) 

823 ) 

824 result = self._db.query(query) 

825 if (row := result.first()) is not None: 

826 # Only include the first one in the exception message 

827 raise ConflictingDefinitionError( 

828 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}" 

829 ) 

830 

831 # Check that matching run+dataId have the same dataset ID. 

832 query = ( 

833 sqlalchemy.sql.select( 

834 tags.columns.dataset_type_id.label("dataset_type_id"), 

835 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

836 tags.columns.dataset_id, 

837 tmp_tags.columns.dataset_id.label("new dataset_id"), 

838 tags.columns[collFkName], 

839 tmp_tags.columns[collFkName].label(f"new {collFkName}"), 

840 ) 

841 .select_from( 

842 tags.join( 

843 tmp_tags, 

844 sqlalchemy.sql.and_( 

845 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id, 

846 tags.columns[collFkName] == tmp_tags.columns[collFkName], 

847 *[ 

848 tags.columns[dim] == tmp_tags.columns[dim] 

849 for dim in self.datasetType.dimensions.required.names 

850 ], 

851 ), 

852 ) 

853 ) 

854 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id) 

855 ) 

856 result = self._db.query(query) 

857 if (row := result.first()) is not None: 

858 # only include the first one in the exception message 

859 raise ConflictingDefinitionError( 

860 f"Existing dataset type and dataId does not match new dataset: {row._asdict()}" 

861 ) 

862 

863 def _makeDatasetId( 

864 self, run: RunRecord, dataId: DataCoordinate, idGenerationMode: DatasetIdGenEnum 

865 ) -> uuid.UUID: 

866 """Generate dataset ID for a dataset. 

867 

868 Parameters 

869 ---------- 

870 run : `RunRecord` 

871 The record object describing the RUN collection for the dataset. 

872 dataId : `DataCoordinate` 

873 Expanded data ID for the dataset. 

874 idGenerationMode : `DatasetIdGenEnum` 

875 ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random

876 UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a 

877 deterministic UUID5-type ID based on a dataset type name and 

878 ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a 

879 deterministic UUID5-type ID based on a dataset type name, run 

880 collection name, and ``dataId``. 

881 

882 Returns 

883 ------- 

884 datasetId : `uuid.UUID` 

885 Dataset identifier. 

886 """ 

887 if idGenerationMode is DatasetIdGenEnum.UNIQUE: 

888 return uuid.uuid4() 

889 else: 

890 # WARNING: If you modify this code make sure that the order of 

891 # items in the `items` list below never changes. 

892 items: List[Tuple[str, str]] = [] 

893 if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE: 

894 items = [ 

895 ("dataset_type", self.datasetType.name), 

896 ] 

897 elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN: 897 ↛ 903 (line 897 didn't jump to line 903, because the condition on line 897 was never false)

898 items = [ 

899 ("dataset_type", self.datasetType.name), 

900 ("run", run.name), 

901 ] 

902 else: 

903 raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}") 

904 

905 for name, value in sorted(dataId.byName().items()): 

906 items.append((name, str(value))) 
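# (Illustrative note with hypothetical values, not in the original source:
# in DATAID_TYPE_RUN mode the string built below might look like
# "dataset_type=flat,run=HSC/calib,detector=12,instrument=HSC", and hashing
# it with uuid5 against NS_UUID yields the same dataset ID for the same inputs.)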

907 data = ",".join(f"{key}={value}" for key, value in items) 

908 return uuid.uuid5(self.NS_UUID, data)