from __future__ import annotations

__all__ = ("ByDimensionsDatasetRecordStorage",)

from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Set,
    Tuple,
    TYPE_CHECKING,
)
import uuid

import sqlalchemy

from lsst.daf.butler import (
    CollectionType,
    DataCoordinate,
    DataCoordinateSet,
    DatasetId,
    DatasetRef,
    DatasetType,
    SimpleQuery,
    Timespan,
)
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.daf.butler.registry.interfaces import DatasetRecordStorage, DatasetIdGenEnum

from ...summaries import GovernorDimensionRestriction

if TYPE_CHECKING:
    from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
    from .tables import StaticDatasetTablesTuple
    from .summaries import CollectionSummaryManager

class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
    """Dataset record storage implementation paired with
    `ByDimensionsDatasetRecordStorageManager`; see that class for more
    information.

    Instances of this class should never be constructed directly; use
    `DatasetRecordStorageManager.register` instead.
    """
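    # Illustrative sketch only: a typical caller obtains an instance through
    # the manager rather than this constructor, along the lines of
    #
    #     storage = manager.register(datasetType)   # hypothetical ``manager``
    #     ref = storage.find(collectionRecord, dataId)
    #
    # where ``manager`` is assumed to be a
    # ``ByDimensionsDatasetRecordStorageManager``; the exact call pattern is
    # inferred from the docstring above, not from this module.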

    def __init__(self, *, datasetType: DatasetType,
                 db: Database,
                 dataset_type_id: int,
                 collections: CollectionManager,
                 static: StaticDatasetTablesTuple,
                 summaries: CollectionSummaryManager,
                 tags: sqlalchemy.schema.Table,
                 calibs: Optional[sqlalchemy.schema.Table]):
        super().__init__(datasetType=datasetType)
        self._dataset_type_id = dataset_type_id
        self._db = db
        self._collections = collections
        self._static = static
        self._summaries = summaries
        self._tags = tags
        self._calibs = calibs
        self._runKeyColumn = collections.getRunForeignKeyName()

    def find(self, collection: CollectionRecord, dataId: DataCoordinate,
             timespan: Optional[Timespan] = None) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.
        assert dataId.graph == self.datasetType.dimensions
        if collection.type is CollectionType.CALIBRATION and timespan is None:
            raise TypeError(f"Cannot search for dataset in CALIBRATION collection {collection.name} "
                            f"without an input timespan.")
        sql = self.select(collection=collection, dataId=dataId, id=SimpleQuery.Select,
                          run=SimpleQuery.Select, timespan=timespan).combine()
        results = self._db.query(sql)
        row = results.fetchone()
        if row is None:
            return None
        if collection.type is CollectionType.CALIBRATION:
            # For temporal calibration lookups (only!) our invariants do not
            # guarantee that the number of result rows is <= 1.
            # They would if `select` constrained the given timespan to be
            # _contained_ by the validity range in the self._calibs table,
            # instead of simply _overlapping_ it, because we do guarantee that
            # the validity ranges are disjoint for a particular dataset type,
            # collection, and data ID.  But using an overlap test and a check
            # for multiple result rows here allows us to provide a more useful
            # diagnostic, as well as allowing `select` to support more general
            # queries where multiple results are not an error.
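            # For example (illustrative values only): if the collection holds
            # validity ranges [t1, t2) and [t2, t3) for the same data ID, a
            # lookup with the query timespan [t1, t3) overlaps both rows even
            # though each row satisfies the disjointness invariant; the second
            # fetch below turns that into a clear error rather than an
            # arbitrary choice between the two.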

            if results.fetchone() is not None:
                raise RuntimeError(
                    f"Multiple matches found for calibration lookup in {collection.name} for "
                    f"{self.datasetType.name} with {dataId} overlapping {timespan}. "
                )
        return DatasetRef(
            datasetType=self.datasetType,
            dataId=dataId,
            id=row["id"],
            run=self._collections[row[self._runKeyColumn]].name
        )

    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        # Only delete from common dataset table; ON DELETE foreign key clauses
        # will handle the rest.
        self._db.delete(
            self._static.dataset,
            ["id"],
            *[{"id": dataset.getCheckedId()} for dataset in datasets],
        )

    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:
            raise TypeError(f"Cannot associate into collection '{collection.name}' "
                            f"of type {collection.type.name}; must be TAGGED.")
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            governorValues.update_extract(dataset.dataId)
            rows.append(row)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values will be
        # inserted there.
        self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
        # Update the tag table itself.
        self._db.replace(self._tags, *rows)

    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:
            raise TypeError(f"Cannot disassociate from collection '{collection.name}' "
                            f"of type {collection.type.name}; must be TAGGED.")
        rows = [
            {
                "dataset_id": dataset.getCheckedId(),
                self._collections.getCollectionForeignKeyName(): collection.key
            }
            for dataset in datasets
        ]
        self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()],
                        *rows)

    def _buildCalibOverlapQuery(self, collection: CollectionRecord,
                                dataIds: Optional[DataCoordinateSet],
                                timespan: Timespan) -> SimpleQuery:
        assert self._calibs is not None
        # Start by building a SELECT query for any rows that would overlap
        # this one.
        query = SimpleQuery()
        query.join(self._calibs)
        # Add a WHERE clause matching the dataset type and collection.
        query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id)
        query.where.append(
            self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key
        )
        # Add a WHERE clause matching any of the given data IDs.
        if dataIds is not None:
            dataIds.constrain(
                query,
                lambda name: self._calibs.columns[name],  # type: ignore
            )
        # Add a WHERE clause for timespan overlaps.
        TimespanReprClass = self._db.getTimespanRepresentation()
        query.where.append(
            TimespanReprClass.fromSelectable(self._calibs).overlaps(TimespanReprClass.fromLiteral(timespan))
        )
        return query

    def certify(self, collection: CollectionRecord, datasets: Iterable[DatasetRef],
                timespan: Timespan) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:
            raise TypeError(f"Cannot certify datasets of type {self.datasetType.name}, for which "
                            f"DatasetType.isCalibration() is False.")
        if collection.type is not CollectionType.CALIBRATION:
            raise TypeError(f"Cannot certify into collection '{collection.name}' "
                            f"of type {collection.type.name}; must be CALIBRATION.")
        TimespanReprClass = self._db.getTimespanRepresentation()
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
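        # Only collect the data IDs when the database cannot enforce
        # validity-range disjointness itself: they are needed below to build
        # the manual overlap-check query, and are never consulted when a real
        # exclusion constraint is available.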

        dataIds: Optional[Set[DataCoordinate]] = (
            set() if not TimespanReprClass.hasExclusionConstraint() else None
        )
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            TimespanReprClass.update(timespan, result=row)
            governorValues.update_extract(dataset.dataId)
            rows.append(row)
            if dataIds is not None:
                dataIds.add(dataset.dataId)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values will be
        # inserted there.
        self._summaries.update(collection, self.datasetType, self._dataset_type_id, governorValues)
        # Update the association table itself.
        if TimespanReprClass.hasExclusionConstraint():
            # Rely on database constraint to enforce invariants; we just
            # reraise the exception for consistency across DB engines.
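            # (For example, a timespan representation backed by PostgreSQL
            # range types could delegate this to an EXCLUDE constraint, while
            # backends without such a feature fall through to the manual check
            # in the else branch; which representations actually do so is
            # reported by ``hasExclusionConstraint()``, not assumed here.)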

            try:
                self._db.insert(self._calibs, *rows)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Validity range conflict certifying datasets of type {self.datasetType.name} "
                    f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
                ) from err
        else:
            # Have to implement the exclusion constraint ourselves.
            # Start by building a SELECT query for any rows that would overlap
            # this one.
            query = self._buildCalibOverlapQuery(
                collection,
                DataCoordinateSet(dataIds, graph=self.datasetType.dimensions),  # type: ignore
                timespan
            )
            query.columns.append(sqlalchemy.sql.func.count())
            sql = query.combine()
            # Acquire a table lock to ensure there are no concurrent writes
            # that could invalidate our check before we finish the inserts.
            # We use a SAVEPOINT in case there is an outer transaction that a
            # failure here should not roll back.
            with self._db.transaction(lock=[self._calibs], savepoint=True):
                # Run the check SELECT query.
                conflicting = self._db.query(sql).scalar()
                if conflicting > 0:
                    raise ConflictingDefinitionError(
                        f"{conflicting} validity range conflicts certifying datasets of type "
                        f"{self.datasetType.name} into {collection.name} for range "
                        f"[{timespan.begin}, {timespan.end})."
                    )
                # Proceed with the insert.
                self._db.insert(self._calibs, *rows)

    def decertify(self, collection: CollectionRecord, timespan: Timespan, *,
                  dataIds: Optional[Iterable[DataCoordinate]] = None) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:
            raise TypeError(f"Cannot decertify datasets of type {self.datasetType.name}, for which "
                            f"DatasetType.isCalibration() is False.")
        if collection.type is not CollectionType.CALIBRATION:
            raise TypeError(f"Cannot decertify from collection '{collection.name}' "
                            f"of type {collection.type.name}; must be CALIBRATION.")
        TimespanReprClass = self._db.getTimespanRepresentation()
        # Construct a SELECT query to find all rows that overlap our inputs.
        dataIdSet: Optional[DataCoordinateSet]
        if dataIds is not None:
            dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions)
        else:
            dataIdSet = None
        query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan)
        query.columns.extend(self._calibs.columns)
        sql = query.combine()
        # Set up collections to populate with the rows we'll want to modify.
        # The insert rows will have the same values for collection and
        # dataset type.
        protoInsertRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rowsToDelete = []
        rowsToInsert = []
        # Acquire a table lock to ensure there are no concurrent writes
        # between the SELECT and the DELETE and INSERT queries based on it.
        with self._db.transaction(lock=[self._calibs], savepoint=True):
            for row in self._db.query(sql):
                rowsToDelete.append({"id": row["id"]})
                # Construct the insert row(s) by copying the prototype row,
                # then adding the dimension column values, then adding what's
                # left of the timespan from that row after we subtract the
                # given timespan.
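                # For instance (illustrative values only): decertifying
                # [t2, t3) from an existing row whose validity range is
                # [t1, t4) deletes that row and re-inserts two rows covering
                # [t1, t2) and [t3, t4); if the given timespan fully covers
                # the row, nothing is re-inserted.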

                newInsertRow = protoInsertRow.copy()
                newInsertRow["dataset_id"] = row["dataset_id"]
                for name in self.datasetType.dimensions.required.names:
                    newInsertRow[name] = row[name]
                rowTimespan = TimespanReprClass.extract(row)
                assert rowTimespan is not None, "Field should have a NOT NULL constraint."
                for diffTimespan in rowTimespan.difference(timespan):
                    rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()))
            # Run the DELETE and INSERT queries.
            self._db.delete(self._calibs, ["id"], *rowsToDelete)
            self._db.insert(self._calibs, *rowsToInsert)

    def select(self, collection: CollectionRecord,
               dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
               id: SimpleQuery.Select.Or[Optional[int]] = SimpleQuery.Select,
               run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
               timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
               ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
               ) -> SimpleQuery:
        # Docstring inherited from DatasetRecordStorage.
        assert collection.type is not CollectionType.CHAINED
        query = SimpleQuery()
        # We always include the _static.dataset table, and we can always get
        # the id and run fields from that; passing them as kwargs here tells
        # SimpleQuery to handle them whether they're constraints or results.
        # We always constrain the dataset_type_id here as well.
        static_kwargs = {self._runKeyColumn: run}
        if ingestDate is not None:
            static_kwargs["ingest_date"] = SimpleQuery.Select
        query.join(
            self._static.dataset,
            id=id,
            dataset_type_id=self._dataset_type_id,
            **static_kwargs
        )
        # If and only if the collection is a RUN, we constrain it in the
        # static table (and also in the tags or calibs table below).
        if collection.type is CollectionType.RUN:
            query.where.append(self._static.dataset.columns[self._runKeyColumn]
                               == collection.key)
        # We get or constrain the data ID from the tags/calibs table, but
        # that's multiple columns, not one, so we need to transform the one
        # Select.Or argument into a dictionary of them.
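        # As an illustration (hypothetical dimensions, not taken from this
        # code): for a dataset type whose required dimensions are instrument
        # and detector, this yields either
        #     {"instrument": SimpleQuery.Select, "detector": SimpleQuery.Select}
        # when the data ID is to be returned, or
        #     {"instrument": "HSC", "detector": 42}
        # when a concrete data ID was passed in as a constraint.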

        kwargs: Dict[str, Any]
        if dataId is SimpleQuery.Select:
            kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required}
        else:
            kwargs = dict(dataId.byName())
        # We always constrain (never retrieve) the collection from the tags
        # table.
        kwargs[self._collections.getCollectionForeignKeyName()] = collection.key
        # Constrain the ingest time.
        if isinstance(ingestDate, Timespan):
            # Timespan bounds are astropy Time (usually in TAI) while
            # ingest_date is a TIMESTAMP column, so convert the values to
            # Python datetime for SQLAlchemy.
            if ingestDate.isEmpty():
                raise RuntimeError("Empty timespan constraint provided for ingest_date.")
            if ingestDate.begin is not None:
                begin = ingestDate.begin.utc.datetime  # type: ignore
                query.where.append(self._static.dataset.ingest_date >= begin)
            if ingestDate.end is not None:
                end = ingestDate.end.utc.datetime  # type: ignore
                query.where.append(self._static.dataset.ingest_date < end)
        # And now we finally join in the tags or calibs table.
        if collection.type is CollectionType.CALIBRATION:
            assert self._calibs is not None, \
                "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
            TimespanReprClass = self._db.getTimespanRepresentation()
            # Add the timespan column(s) to the result columns, or constrain
            # the timespan via an overlap condition.
            if timespan is SimpleQuery.Select:
                kwargs.update({k: SimpleQuery.Select for k in TimespanReprClass.getFieldNames()})
            elif timespan is not None:
                query.where.append(
                    TimespanReprClass.fromSelectable(self._calibs).overlaps(
                        TimespanReprClass.fromLiteral(timespan)
                    )
                )
            query.join(
                self._calibs,
                onclause=(self._static.dataset.columns.id == self._calibs.columns.dataset_id),
                **kwargs
            )
        else:
            query.join(
                self._tags,
                onclause=(self._static.dataset.columns.id == self._tags.columns.dataset_id),
                **kwargs
            )
        return query

    def getDataId(self, id: DatasetId) -> DataCoordinate:
        """Return DataId for a dataset.

        Parameters
        ----------
        id : `DatasetId`
            Unique dataset identifier.

        Returns
        -------
        dataId : `DataCoordinate`
            DataId for the dataset.
        """
        # This query could return multiple rows (one for each tagged
        # collection the dataset is in, plus one for its run collection),
        # and we don't care which of those we get.
        sql = self._tags.select().where(
            sqlalchemy.sql.and_(
                self._tags.columns.dataset_id == id,
                self._tags.columns.dataset_type_id == self._dataset_type_id
            )
        ).limit(1)
        row = self._db.query(sql).fetchone()
        assert row is not None, "Should be guaranteed by caller and foreign key constraints."
        return DataCoordinate.standardize(
            {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
            graph=self.datasetType.dimensions
        )

class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage):
    """Implementation of ByDimensionsDatasetRecordStorage which uses an
    auto-incremented integer column for dataset IDs.
    """

    def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate],
               idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # We only support UNIQUE mode for integer dataset IDs.
        if idMode != DatasetIdGenEnum.UNIQUE:
            raise ValueError("Only UNIQUE mode can be used with integer dataset IDs.")

        # Transform a possibly-single-pass iterable into a list.
        dataIdList = list(dataIds)
        yield from self._insert(run, dataIdList)

    def import_(self, run: RunRecord, datasets: Iterable[DatasetRef],
                idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
                reuseIds: bool = False) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # We only support UNIQUE mode for integer dataset IDs.
        if idGenerationMode != DatasetIdGenEnum.UNIQUE:
            raise ValueError("Only UNIQUE mode can be used with integer dataset IDs.")

        # Make a list of dataIds and optionally dataset IDs.
        dataIdList: List[DataCoordinate] = []
        datasetIdList: List[int] = []
        for dataset in datasets:
            dataIdList.append(dataset.dataId)

            # We only accept integer dataset IDs, but also allow None.
            datasetId = dataset.id
            if datasetId is None:
                # If reuseIds is set then all IDs must be known.
                if reuseIds:
                    raise TypeError("All dataset IDs must be known if `reuseIds` is set")
            elif isinstance(datasetId, int):
                if reuseIds:
                    datasetIdList.append(datasetId)
            else:
                raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}")

        yield from self._insert(run, dataIdList, datasetIdList)

    def _insert(self, run: RunRecord, dataIdList: List[DataCoordinate],
                datasetIdList: Optional[List[int]] = None) -> Iterator[DatasetRef]:
        """Common part of the implementation of the `insert` and `import_`
        methods.
        """
        # Remember any governor dimension values we see.
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataId in dataIdList:
            governorValues.update_extract(dataId)

        staticRow = {
            "dataset_type_id": self._dataset_type_id,
            self._runKeyColumn: run.key,
        }
        with self._db.transaction():
            # Insert into the static dataset table, generating autoincrement
            # dataset_id values if we were not given any.
            if datasetIdList:
                # Reuse existing IDs.
                rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList]
                self._db.insert(self._static.dataset, *rows)
            else:
                # Use auto-incremented IDs.
                datasetIdList = self._db.insert(self._static.dataset, *([staticRow]*len(dataIdList)),
                                                returnIds=True)
                assert datasetIdList is not None
            # Update the summary tables for this collection in case this is
            # the first time this dataset type or these governor values will
            # be inserted there.
            self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
            # Combine the generated dataset_id values and data ID fields to
            # form rows to be inserted into the tags table.
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): run.key,
            }
            tagsRows = [
                dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
                for dataId, dataset_id in zip(dataIdList, datasetIdList)
            ]
            # Insert those rows into the tags table.  This is where we'll
            # get any unique constraint violations.
            self._db.insert(self._tags, *tagsRows)

        for dataId, datasetId in zip(dataIdList, datasetIdList):
            yield DatasetRef(
                datasetType=self.datasetType,
                dataId=dataId,
                id=datasetId,
                run=run.name,
            )

class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
    """Implementation of ByDimensionsDatasetRecordStorage which uses UUIDs
    for dataset IDs.
    """

    NS_UUID = uuid.UUID('840b31d9-05cd-5161-b2c8-00d32b280d0f')
    """Namespace UUID used for UUID5 generation.  Do not change.  This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate],
               idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # Iterate over data IDs, transforming a possibly-single-pass iterable
        # into a list.
        dataIdList = []
        rows = []
        for dataId in dataIds:
            dataIdList.append(dataId)
            rows.append({
                "id": self._makeDatasetId(run, dataId, idMode),
                "dataset_type_id": self._dataset_type_id,
                self._runKeyColumn: run.key,
            })

        yield from self._insert(run, dataIdList, rows, self._db.insert)

    def import_(self, run: RunRecord, datasets: Iterable[DatasetRef],
                idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
                reuseIds: bool = False) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.

        # Iterate over data IDs, transforming a possibly-single-pass iterable
        # into a list.
        dataIdList = []
        rows = []
        for dataset in datasets:
            dataIdList.append(dataset.dataId)
            # Ignore unknown ID types; normally all IDs have the same type,
            # but this code supports mixed types or missing IDs.
            datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None
            if datasetId is None:
                datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode)
            rows.append({
                "id": datasetId,
                "dataset_type_id": self._dataset_type_id,
                self._runKeyColumn: run.key,
            })

        yield from self._insert(run, dataIdList, rows, self._db.ensure)

    def _insert(self, run: RunRecord, dataIdList: List[DataCoordinate],
                rows: List[Dict], insertMethod: Callable) -> Iterator[DatasetRef]:
        """Common part of the implementation of the `insert` and `import_`
        methods.
        """
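        # ``insertMethod`` is the bound Database method chosen by the caller:
        # ``insert`` passes ``self._db.insert`` (plain inserts), while
        # ``import_`` passes ``self._db.ensure``, presumably the idempotent
        # variant that tolerates rows that already exist.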

        # Remember any governor dimension values we see.
        governorValues = GovernorDimensionRestriction.makeEmpty(self.datasetType.dimensions.universe)
        for dataId in dataIdList:
            governorValues.update_extract(dataId)

        with self._db.transaction():
            # Insert into the static dataset table.
            insertMethod(self._static.dataset, *rows)
            # Update the summary tables for this collection in case this is
            # the first time this dataset type or these governor values will
            # be inserted there.
            self._summaries.update(run, self.datasetType, self._dataset_type_id, governorValues)
            # Combine the generated dataset_id values and data ID fields to
            # form rows to be inserted into the tags table.
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): run.key,
            }
            tagsRows = [
                dict(protoTagsRow, dataset_id=row["id"], **dataId.byName())
                for dataId, row in zip(dataIdList, rows)
            ]
            # Insert those rows into the tags table.
            insertMethod(self._tags, *tagsRows)
        for dataId, row in zip(dataIdList, rows):
            yield DatasetRef(
                datasetType=self.datasetType,
                dataId=dataId,
                id=row["id"],
                run=run.name,
            )

    def _makeDatasetId(self, run: RunRecord, dataId: DataCoordinate,
                       idGenerationMode: DatasetIdGenEnum) -> uuid.UUID:
        """Generate dataset ID for a dataset.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the RUN collection for the dataset.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option.  `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID.  `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``.  `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
        """
        if idGenerationMode is DatasetIdGenEnum.UNIQUE:
            return uuid.uuid4()
        else:
            # WARNING: If you modify this code make sure that the order of
            # items in the `items` list below never changes.
            items: List[Tuple[str, str]] = []
            if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
                items = [
                    ("dataset_type", self.datasetType.name),
                ]
            elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
                items = [
                    ("dataset_type", self.datasetType.name),
                    ("run", run.name),
                ]
            else:
                raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

            for name, value in sorted(dataId.byName().items()):
                items.append((name, str(value)))
            data = ",".join(f"{key}={value}" for key, value in items)
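            # For example (hypothetical names and values, for illustration
            # only): a DATAID_TYPE_RUN ID for a "flat" dataset type in run
            # "HSC/calib" with data ID {instrument: "HSC", detector: 42}
            # hashes the string
            #     "dataset_type=flat,run=HSC/calib,detector=42,instrument=HSC"
            # so identical inputs always map to the same UUID5.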

            return uuid.uuid5(self.NS_UUID, data)