from __future__ import annotations

__all__ = ("ByDimensionsDatasetRecordStorage",)

from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Set,
    Tuple,
    TYPE_CHECKING,
)
import uuid

import sqlalchemy

from lsst.daf.butler import (
    CollectionType,
    DataCoordinate,
    DataCoordinateSet,
    DatasetId,
    DatasetRef,
    DatasetType,
    SimpleQuery,
    Timespan,
)
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.daf.butler.registry.interfaces import DatasetRecordStorage, DatasetIdGenEnum

from ...summaries import GovernorDimensionRestriction

if TYPE_CHECKING:
    from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
    from .tables import StaticDatasetTablesTuple
    from .summaries import CollectionSummaryManager


class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
    """Dataset record storage implementation paired with
    `ByDimensionsDatasetRecordStorageManager`; see that class for more
    information.

    Instances of this class should never be constructed directly; use
    `DatasetRecordStorageManager.register` instead.
    """

    def __init__(self, *, datasetType: DatasetType,
                 db: Database,
                 dataset_type_id: int,
                 collections: CollectionManager,
                 static: StaticDatasetTablesTuple,
                 summaries: CollectionSummaryManager,
                 tags: sqlalchemy.schema.Table,
                 calibs: Optional[sqlalchemy.schema.Table]):
        super().__init__(datasetType=datasetType)
        self._dataset_type_id = dataset_type_id
        self._db = db
        self._collections = collections
        self._static = static
        self._summaries = summaries
        self._tags = tags
        self._calibs = calibs
        self._runKeyColumn = collections.getRunForeignKeyName()

    def find(self, collection: CollectionRecord, dataId: DataCoordinate,
             timespan: Optional[Timespan] = None) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.
        assert dataId.graph == self.datasetType.dimensions
        if collection.type is CollectionType.CALIBRATION and timespan is None:
            raise TypeError(f"Cannot search for dataset in CALIBRATION collection {collection.name} "
                            f"without an input timespan.")
        sql = self.select(collection=collection, dataId=dataId, id=SimpleQuery.Select,
                          run=SimpleQuery.Select, timespan=timespan)
        if sql is None:
            return None
        else:
            sql = sql.combine()
        results = self._db.query(sql)
        row = results.fetchone()
        if row is None:
            return None
        if collection.type is CollectionType.CALIBRATION:
            # For temporal calibration lookups (only!) our invariants do not
            # guarantee that the number of result rows is <= 1.
            # They would if `select` constrained the given timespan to be
            # _contained_ by the validity range in the self._calibs table,
            # instead of simply _overlapping_ it, because we do guarantee that
            # the validity ranges are disjoint for a particular dataset type,
            # collection, and data ID.  But using an overlap test and a check
            # for multiple result rows here allows us to provide a more useful
            # diagnostic, as well as allowing `select` to support more general
            # queries where multiple results are not an error.
            if results.fetchone() is not None:
                raise RuntimeError(
                    f"Multiple matches found for calibration lookup in {collection.name} for "
                    f"{self.datasetType.name} with {dataId} overlapping {timespan}. "
                )
        return DatasetRef(
            datasetType=self.datasetType,
            dataId=dataId,
            id=row["id"],
            run=self._collections[row[self._runKeyColumn]].name
        )
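
    # Hedged usage sketch (not in the original source): looking up a dataset
    # in a CALIBRATION collection requires a timespan, otherwise `find` raises
    # `TypeError`.  ``calib_record``, ``data_id``, ``t0`` and ``t1`` below are
    # hypothetical.
    #
    #     ref = storage.find(calib_record, data_id, timespan=Timespan(t0, t1))
    #     if ref is None:
    #         ...  # no dataset whose validity range overlaps the given timespan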

    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        # Only delete from common dataset table; ON DELETE foreign key clauses
        # will handle the rest.
        self._db.delete(
            self._static.dataset,
            ["id"],
            *[{"id": dataset.getCheckedId()} for dataset in datasets],
        )

    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:
            raise TypeError(f"Cannot associate into collection '{collection.name}' "
                            f"of type {collection.type.name}; must be TAGGED.")
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            governorValues.update_extract(dataset.dataId)
            rows.append(row)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values will be
        # inserted there.
        self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
        # Update the tag table itself.
        self._db.replace(self._tags, *rows)

    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:
            raise TypeError(f"Cannot disassociate from collection '{collection.name}' "
                            f"of type {collection.type.name}; must be TAGGED.")
        rows = [
            {
                "dataset_id": dataset.getCheckedId(),
                self._collections.getCollectionForeignKeyName(): collection.key
            }
            for dataset in datasets
        ]
        self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()],
                        *rows)

    def _buildCalibOverlapQuery(self, collection: CollectionRecord,
                                dataIds: Optional[DataCoordinateSet],
                                timespan: Timespan) -> SimpleQuery:
        assert self._calibs is not None
        # Start by building a SELECT query for any rows that would overlap
        # this one.
        query = SimpleQuery()
        query.join(self._calibs)
        # Add a WHERE clause matching the dataset type and collection.
        query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id)
        query.where.append(
            self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key
        )
        # Add a WHERE clause matching any of the given data IDs.
        if dataIds is not None:
            dataIds.constrain(
                query,
                lambda name: self._calibs.columns[name],  # type: ignore
            )
        # Add a WHERE clause for timespan overlaps.
        TimespanReprClass = self._db.getTimespanRepresentation()
        query.where.append(
            TimespanReprClass.fromSelectable(self._calibs).overlaps(TimespanReprClass.fromLiteral(timespan))
        )
        return query

    def certify(self, collection: CollectionRecord, datasets: Iterable[DatasetRef],
                timespan: Timespan) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:
            raise TypeError(f"Cannot certify datasets of type {self.datasetType.name}, for which "
                            f"DatasetType.isCalibration() is False.")
        if collection.type is not CollectionType.CALIBRATION:
            raise TypeError(f"Cannot certify into collection '{collection.name}' "
                            f"of type {collection.type.name}; must be CALIBRATION.")
        TimespanReprClass = self._db.getTimespanRepresentation()
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        dataIds: Optional[Set[DataCoordinate]] = (
            set() if not TimespanReprClass.hasExclusionConstraint() else None
        )
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            TimespanReprClass.update(timespan, result=row)
            governorValues.update_extract(dataset.dataId)
            rows.append(row)
            if dataIds is not None:
                dataIds.add(dataset.dataId)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values will be
        # inserted there.
        self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
        # Update the association table itself.
        if TimespanReprClass.hasExclusionConstraint():
            # Rely on the database constraint to enforce invariants; we just
            # reraise the exception for consistency across DB engines.
            try:
                self._db.insert(self._calibs, *rows)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Validity range conflict certifying datasets of type {self.datasetType.name} "
                    f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
                ) from err
        else:
            # Have to implement the exclusion constraint ourselves.
            # Start by building a SELECT query for any rows that would overlap
            # this one.
            query = self._buildCalibOverlapQuery(
                collection,
                DataCoordinateSet(dataIds, graph=self.datasetType.dimensions),  # type: ignore
                timespan
            )
            query.columns.append(sqlalchemy.sql.func.count())
            sql = query.combine()
            # Acquire a table lock to ensure there are no concurrent writes
            # that could invalidate our checking before we finish the inserts.
            # We use a SAVEPOINT in case there is an outer transaction that a
            # failure here should not roll back.
            with self._db.transaction(lock=[self._calibs], savepoint=True):
                # Run the check SELECT query.
                conflicting = self._db.query(sql).scalar()
                if conflicting > 0:
                    raise ConflictingDefinitionError(
                        f"{conflicting} validity range conflicts certifying datasets of type "
                        f"{self.datasetType.name} into {collection.name} for range "
                        f"[{timespan.begin}, {timespan.end})."
                    )
                # Proceed with the insert.
                self._db.insert(self._calibs, *rows)
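
    # Hedged behavior sketch (not in the original source): certifying into a
    # CALIBRATION collection with a validity range that overlaps an existing
    # one for the same dataset type and data ID is expected to raise
    # `ConflictingDefinitionError`, whether enforced by a database exclusion
    # constraint or by the manual SELECT above.  Names below are hypothetical.
    #
    #     storage.certify(calib_record, refs, Timespan(t0, t1))
    #     storage.certify(calib_record, refs, Timespan(t0, t2))  # overlap -> error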

    def decertify(self, collection: CollectionRecord, timespan: Timespan, *,
                  dataIds: Optional[Iterable[DataCoordinate]] = None) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:
            raise TypeError(f"Cannot decertify datasets of type {self.datasetType.name}, for which "
                            f"DatasetType.isCalibration() is False.")
        if collection.type is not CollectionType.CALIBRATION:
            raise TypeError(f"Cannot decertify from collection '{collection.name}' "
                            f"of type {collection.type.name}; must be CALIBRATION.")
        TimespanReprClass = self._db.getTimespanRepresentation()
        # Construct a SELECT query to find all rows that overlap our inputs.
        dataIdSet: Optional[DataCoordinateSet]
        if dataIds is not None:
            dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions)
        else:
            dataIdSet = None
        query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan)
        query.columns.extend(self._calibs.columns)
        sql = query.combine()
        # Set up collections to populate with the rows we'll want to modify.
        # The insert rows will have the same values for collection and
        # dataset type.
        protoInsertRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rowsToDelete = []
        rowsToInsert = []
        # Acquire a table lock to ensure there are no concurrent writes
        # between the SELECT and the DELETE and INSERT queries based on it.
        with self._db.transaction(lock=[self._calibs], savepoint=True):
            for row in self._db.query(sql):
                rowsToDelete.append({"id": row["id"]})
                # Construct the insert row(s) by copying the prototype row,
                # then adding the dimension column values, then adding what's
                # left of the timespan from that row after we subtract the
                # given timespan.
                newInsertRow = protoInsertRow.copy()
                newInsertRow["dataset_id"] = row["dataset_id"]
                for name in self.datasetType.dimensions.required.names:
                    newInsertRow[name] = row[name]
                rowTimespan = TimespanReprClass.extract(row)
                assert rowTimespan is not None, "Field should have a NOT NULL constraint."
                for diffTimespan in rowTimespan.difference(timespan):
                    rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()))
            # Run the DELETE and INSERT queries.
            self._db.delete(self._calibs, ["id"], *rowsToDelete)
            self._db.insert(self._calibs, *rowsToInsert)

    def select(self, collection: CollectionRecord,
               dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
               id: SimpleQuery.Select.Or[Optional[int]] = SimpleQuery.Select,
               run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
               timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
               ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
               ) -> Optional[SimpleQuery]:
        # Docstring inherited from DatasetRecordStorage.
        assert collection.type is not CollectionType.CHAINED
        #
        # There are two tables in play here:
        #
        #  - the static dataset table (with the dataset ID, dataset type ID,
        #    run ID/name, and ingest date);
        #
        #  - the dynamic tags/calibs table (with the dataset ID, dataset type
        #    ID, collection ID/name, data ID, and possibly validity range).
        #
        # That means that we might want to return a query against either table
        # or a JOIN of both, depending on which quantities the caller wants.
        # But this method is documented/typed such that ``dataId`` is never
        # `None` - i.e. we always constrain or retrieve the data ID.  That
        # means we'll always include the tags/calibs table and join in the
        # static dataset table only if we need things from it that we can't
        # get from the tags/calibs table.
        #
        # Note that it's important that we include a WHERE constraint on both
        # tables for any column (e.g. dataset_type_id) that is in both when
        # it's given explicitly; not doing so can prevent the query planner
        # from using very important indexes.  At present, we don't include
        # those redundant columns in the JOIN ON expression, however, because
        # the FOREIGN KEY (and its index) are defined only on dataset_id.
        #
        # We'll start with an empty SimpleQuery, and accumulate kwargs to pass
        # to its `join` method when we bring in the tags/calibs table.
        query = SimpleQuery()
        # We get the data ID or constrain it in the tags/calibs table, but
        # that's multiple columns, not one, so we need to transform the one
        # Select.Or argument into a dictionary of them.
        kwargs: Dict[str, Any]
        if dataId is SimpleQuery.Select:
            kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required}
        else:
            kwargs = dict(dataId.byName())
        # We always constrain (never retrieve) the collection in the
        # tags/calibs table.
        kwargs[self._collections.getCollectionForeignKeyName()] = collection.key
        # We always constrain (never retrieve) the dataset type in at least
        # the tags/calibs table.
        kwargs["dataset_type_id"] = self._dataset_type_id
        # Join in the tags or calibs table, turning those 'kwargs' entries
        # into WHERE constraints or SELECT columns as appropriate.
        if collection.type is CollectionType.CALIBRATION:
            assert self._calibs is not None, \
                "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
            TimespanReprClass = self._db.getTimespanRepresentation()
            # Add the timespan column(s) to the result columns, or constrain
            # the timespan via an overlap condition.
            if timespan is SimpleQuery.Select:
                kwargs.update({k: SimpleQuery.Select for k in TimespanReprClass.getFieldNames()})
            elif timespan is not None:
                query.where.append(
                    TimespanReprClass.fromSelectable(self._calibs).overlaps(
                        TimespanReprClass.fromLiteral(timespan)
                    )
                )
            query.join(self._calibs, **kwargs)
            dataset_id_col = self._calibs.columns.dataset_id
        else:
            query.join(self._tags, **kwargs)
            dataset_id_col = self._tags.columns.dataset_id
        # We can always get the dataset_id from the tags/calibs table or
        # constrain it there.  Can't use kwargs for that because we need to
        # alias it to 'id'.
        if id is SimpleQuery.Select:
            query.columns.append(dataset_id_col.label("id"))
        elif id is not None:
            query.where.append(dataset_id_col == id)
        # It's possible we now have everything we need, from just the
        # tags/calibs table.  The things we might need to get from the static
        # dataset table are the run key and the ingest date.
        need_static_table = False
        static_kwargs = {}
        if run is not None:
            if collection.type is CollectionType.RUN:
                if run is SimpleQuery.Select:
                    # If the collection we're searching is a RUN, we know that
                    # if we find the dataset in that collection, then that's
                    # the dataset's run; we don't need to query for it.
                    query.columns.append(sqlalchemy.sql.literal(collection.key).label(self._runKeyColumn))
                elif run != collection.name:
                    # This [sub]query is doomed to yield no results; a dataset
                    # cannot be in more than one run.
                    return None
                else:
                    query.where.append(self._static.dataset.columns[self._runKeyColumn] == collection.key)
            else:
                static_kwargs[self._runKeyColumn] = (
                    SimpleQuery.Select if run is SimpleQuery.Select else self._collections.find(run).key
                )
                need_static_table = True
        # Ingest date can only come from the static table.
        if ingestDate is not None:
            need_static_table = True
            if ingestDate is SimpleQuery.Select:
                static_kwargs["ingest_date"] = SimpleQuery.Select
            else:
                assert isinstance(ingestDate, Timespan)
                # Timespan is astropy Time (usually in TAI) and ingest_date is
                # TIMESTAMP, so convert values to Python datetime for
                # sqlalchemy.
                if ingestDate.isEmpty():
                    raise RuntimeError("Empty timespan constraint provided for ingest_date.")
                if ingestDate.begin is not None:
                    begin = ingestDate.begin.utc.datetime  # type: ignore
                    query.where.append(self._static.dataset.columns.ingest_date >= begin)
                if ingestDate.end is not None:
                    end = ingestDate.end.utc.datetime  # type: ignore
                    query.where.append(self._static.dataset.columns.ingest_date < end)
        # If we need the static table, join it in via dataset_id and
        # dataset_type_id.
        if need_static_table:
            query.join(
                self._static.dataset,
                onclause=(dataset_id_col == self._static.dataset.columns.id),
                **static_kwargs,
            )
            # Also constrain dataset_type_id in the static table in case that
            # helps generate a better plan.
            # We could also include this in the JOIN ON clause, but my guess
            # is that that's a good idea IFF it's in the foreign key, and
            # right now it isn't.
            query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id)
        return query
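
    # Hedged usage sketch (not in the original source): `find` above composes
    # this method with `SimpleQuery.combine`; a direct call might look roughly
    # like the following, with ``tagged_record``, ``data_id``, and ``db``
    # hypothetical.
    #
    #     simple_query = storage.select(collection=tagged_record, dataId=data_id,
    #                                   id=SimpleQuery.Select, run=SimpleQuery.Select)
    #     if simple_query is not None:
    #         row = db.query(simple_query.combine()).fetchone()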

    def getDataId(self, id: DatasetId) -> DataCoordinate:
        """Return the data ID for a dataset.

        Parameters
        ----------
        id : `DatasetId`
            Unique dataset identifier.

        Returns
        -------
        dataId : `DataCoordinate`
            Data ID for the dataset.
        """
        # This query could return multiple rows (one for each tagged
        # collection the dataset is in, plus one for its run collection),
        # and we don't care which of those we get.
        sql = self._tags.select().where(
            sqlalchemy.sql.and_(
                self._tags.columns.dataset_id == id,
                self._tags.columns.dataset_type_id == self._dataset_type_id
            )
        ).limit(1)
        row = self._db.query(sql).fetchone()
        assert row is not None, "Should be guaranteed by caller and foreign key constraints."
        return DataCoordinate.standardize(
            {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
            graph=self.datasetType.dimensions
        )


class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage):
    """Implementation of ByDimensionsDatasetRecordStorage which uses an
    auto-incremented integer column for dataset IDs.
    """

    def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate],
               idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # We only support UNIQUE mode for integer dataset IDs.
        if idMode != DatasetIdGenEnum.UNIQUE:
            raise ValueError("Only UNIQUE mode can be used with integer dataset IDs.")

        # Transform a possibly-single-pass iterable into a list.
        dataIdList = list(dataIds)
        yield from self._insert(run, dataIdList)

    def import_(self, run: RunRecord, datasets: Iterable[DatasetRef],
                idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
                reuseIds: bool = False) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # We only support UNIQUE mode for integer dataset IDs.
        if idGenerationMode != DatasetIdGenEnum.UNIQUE:
            raise ValueError("Only UNIQUE mode can be used with integer dataset IDs.")

        # Make a list of data IDs and optionally dataset IDs.
        dataIdList: List[DataCoordinate] = []
        datasetIdList: List[int] = []
        for dataset in datasets:
            dataIdList.append(dataset.dataId)

            # We only accept integer dataset IDs, but also allow None.
            datasetId = dataset.id
            if datasetId is None:
                # If reuseIds is set then all IDs must be known.
                if reuseIds:
                    raise TypeError("All dataset IDs must be known if `reuseIds` is set")
            elif isinstance(datasetId, int):
                if reuseIds:
                    datasetIdList.append(datasetId)
            else:
                raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}")

        yield from self._insert(run, dataIdList, datasetIdList)

    def _insert(self, run: RunRecord, dataIdList: List[DataCoordinate],
                datasetIdList: Optional[List[int]] = None) -> Iterator[DatasetRef]:
        """Common part of the implementation of the `insert` and `import_`
        methods.
        """

        # Remember any governor dimension values we see.
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataId in dataIdList:
            governorValues.update_extract(dataId)

        staticRow = {
            "dataset_type_id": self._dataset_type_id,
            self._runKeyColumn: run.key,
        }
        with self._db.transaction():
            # Insert into the static dataset table, generating autoincrement
            # dataset_id values.
            if datasetIdList:
                # Reuse existing IDs.
                rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList]
                self._db.insert(self._static.dataset, *rows)
            else:
                # Use auto-incremented IDs.
                datasetIdList = self._db.insert(self._static.dataset, *([staticRow]*len(dataIdList)),
                                                returnIds=True)
                assert datasetIdList is not None
            # Update the summary tables for this collection in case this is
            # the first time this dataset type or these governor values will
            # be inserted there.
            self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
            # Combine the generated dataset_id values and data ID fields to
            # form rows to be inserted into the tags table.
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): run.key,
            }
            tagsRows = [
                dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
                for dataId, dataset_id in zip(dataIdList, datasetIdList)
            ]
            # Insert those rows into the tags table.  This is where we'll
            # get any unique constraint violations.
            self._db.insert(self._tags, *tagsRows)

        for dataId, datasetId in zip(dataIdList, datasetIdList):
            yield DatasetRef(
                datasetType=self.datasetType,
                dataId=dataId,
                id=datasetId,
                run=run.name,
            )


class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
    """Implementation of ByDimensionsDatasetRecordStorage which uses UUIDs
    for dataset IDs.
    """

    NS_UUID = uuid.UUID('840b31d9-05cd-5161-b2c8-00d32b280d0f')
    """Namespace UUID used for UUID5 generation.  Do not change.  This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate],
               idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # Iterate over data IDs, transforming a possibly-single-pass iterable
        # into a list.
        dataIdList = []
        rows = []
        for dataId in dataIds:
            dataIdList.append(dataId)
            rows.append({
                "id": self._makeDatasetId(run, dataId, idMode),
                "dataset_type_id": self._dataset_type_id,
                self._runKeyColumn: run.key,
            })

        yield from self._insert(run, dataIdList, rows, self._db.insert)

    def import_(self, run: RunRecord, datasets: Iterable[DatasetRef],
                idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
                reuseIds: bool = False) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # Iterate over data IDs, transforming a possibly-single-pass iterable
        # into a list.
        dataIdList = []
        rows = []
        for dataset in datasets:
            dataIdList.append(dataset.dataId)
            # Ignore unknown ID types; normally all IDs have the same type,
            # but this code supports mixed types or missing IDs.
            datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None
            if datasetId is None:
                datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode)
            rows.append({
                "id": datasetId,
                "dataset_type_id": self._dataset_type_id,
                self._runKeyColumn: run.key,
            })

        yield from self._insert(run, dataIdList, rows, self._db.ensure)

    def _insert(self, run: RunRecord, dataIdList: List[DataCoordinate],
                rows: List[Dict], insertMethod: Callable) -> Iterator[DatasetRef]:
        """Common part of the implementation of the `insert` and `import_`
        methods.
        """

        # Remember any governor dimension values we see.
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataId in dataIdList:
            governorValues.update_extract(dataId)

        with self._db.transaction():
            # Insert into the static dataset table.
            insertMethod(self._static.dataset, *rows)
            # Update the summary tables for this collection in case this is
            # the first time this dataset type or these governor values will
            # be inserted there.
            self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
            # Combine the generated dataset_id values and data ID fields to
            # form rows to be inserted into the tags table.
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): run.key,
            }
            tagsRows = [
                dict(protoTagsRow, dataset_id=row["id"], **dataId.byName())
                for dataId, row in zip(dataIdList, rows)
            ]
            # Insert those rows into the tags table.
            insertMethod(self._tags, *tagsRows)
        for dataId, row in zip(dataIdList, rows):
            yield DatasetRef(
                datasetType=self.datasetType,
                dataId=dataId,
                id=row["id"],
                run=run.name,
            )

    def _makeDatasetId(self, run: RunRecord, dataId: DataCoordinate,
                       idGenerationMode: DatasetIdGenEnum) -> uuid.UUID:
        """Generate a dataset ID for a dataset.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the RUN collection for the dataset.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option.  `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID.  `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``.  `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
        """
        if idGenerationMode is DatasetIdGenEnum.UNIQUE:
            return uuid.uuid4()
        else:
            # WARNING: If you modify this code make sure that the order of
            # items in the `items` list below never changes.
            items: List[Tuple[str, str]] = []
            if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
                items = [
                    ("dataset_type", self.datasetType.name),
                ]
            elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
                items = [
                    ("dataset_type", self.datasetType.name),
                    ("run", run.name),
                ]
            else:
                raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

            for name, value in sorted(dataId.byName().items()):
                items.append((name, str(value)))
            data = ",".join(f"{key}={value}" for key, value in items)
            return uuid.uuid5(self.NS_UUID, data)
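
# Hedged illustration (not part of the original module): for the deterministic
# modes, `_makeDatasetId` hashes a comma-separated ``key=value`` string with
# UUID5 under ``NS_UUID``.  With the hypothetical dataset type "flat", run
# "HSC/calib", and data ID {"instrument": "HSC", "detector": 42},
# DATAID_TYPE_RUN mode would hash:
#
#     data = "dataset_type=flat,run=HSC/calib,detector=42,instrument=HSC"
#     dataset_id = uuid.uuid5(ByDimensionsDatasetRecordStorageUUID.NS_UUID, data)
#
# The same inputs therefore always yield the same dataset ID, which is why
# `import_` can pass `Database.ensure` rather than `Database.insert` for the
# actual writes.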