Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 80%

309 statements  

coverage.py v6.5.0, created at 2023-01-04 02:04 -0800

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23from __future__ import annotations 

24 

25__all__ = ("ByDimensionsDatasetRecordStorage",) 

26 

27import uuid 

28from collections.abc import Iterable, Iterator, Sequence 

29from typing import TYPE_CHECKING, Any 

30 

31import sqlalchemy 

32from deprecated.sphinx import deprecated 

33 

34from ....core import ( 

35 DataCoordinate, 

36 DataCoordinateSet, 

37 DatasetId, 

38 DatasetRef, 

39 DatasetType, 

40 SimpleQuery, 

41 StorageClass, 

42 Timespan, 

43 ddl, 

44) 

45from ..._collection_summary import CollectionSummary 

46from ..._collectionType import CollectionType 

47from ..._exceptions import CollectionTypeError, ConflictingDefinitionError, UnsupportedIdGeneratorError 

48from ...interfaces import DatasetIdFactory, DatasetIdGenEnum, DatasetRecordStorage 

49from .tables import makeTagTableSpec 

50 

51if TYPE_CHECKING: 51 ↛ 52 (line 51 didn't jump to line 52, because the condition on line 51 was never true)

52 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord 

53 from .summaries import CollectionSummaryManager 

54 from .tables import StaticDatasetTablesTuple 

55 

56 

57class ByDimensionsDatasetRecordStorage(DatasetRecordStorage): 

58 """Dataset record storage implementation paired with 

59 `ByDimensionsDatasetRecordStorageManager`; see that class for more 

60 information. 

61 

62 Instances of this class should never be constructed directly; use 

63 `DatasetRecordStorageManager.register` instead. 

64 """ 

65 

66 def __init__( 

67 self, 

68 *, 

69 datasetType: DatasetType, 

70 db: Database, 

71 dataset_type_id: int, 

72 collections: CollectionManager, 

73 static: StaticDatasetTablesTuple, 

74 summaries: CollectionSummaryManager, 

75 tags: sqlalchemy.schema.Table, 

76 calibs: sqlalchemy.schema.Table | None, 

77 ): 

78 super().__init__(datasetType=datasetType) 

79 self._dataset_type_id = dataset_type_id 

80 self._db = db 

81 self._collections = collections 

82 self._static = static 

83 self._summaries = summaries 

84 self._tags = tags 

85 self._calibs = calibs 

86 self._runKeyColumn = collections.getRunForeignKeyName() 

87 

88 def find( 

89 self, 

90 collection: CollectionRecord, 

91 dataId: DataCoordinate, 

92 timespan: Timespan | None = None, 

93 storage_class: StorageClass | str | None = None, 

94 ) -> DatasetRef | None: 

95 # Docstring inherited from DatasetRecordStorage. 

96 assert dataId.graph == self.datasetType.dimensions 

97 if collection.type is CollectionType.CALIBRATION and timespan is None: 97 ↛ 98 (line 97 didn't jump to line 98, because the condition on line 97 was never true)

98 raise TypeError( 

99 f"Cannot search for dataset in CALIBRATION collection {collection.name} " 

100 f"without an input timespan." 

101 ) 

102 sql = self.select( 

103 collection, dataId=dataId, id=SimpleQuery.Select, run=SimpleQuery.Select, timespan=timespan 

104 ) 

105 with self._db.query(sql) as results: 

106 row = results.fetchone() 

107 if row is None: 

108 return None 

109 if collection.type is CollectionType.CALIBRATION: 

110 # For temporal calibration lookups (only!) our invariants do 

111 # not guarantee that the number of result rows is <= 1. They 

112 # would if `select` constrained the given timespan to be 

113 # _contained_ by the validity range in the self._calibs table, 

114 # instead of simply _overlapping_ it, because we do guarantee 

115 # that the validity ranges are disjoint for a particular 

116 # dataset type, collection, and data ID. But using an overlap 

117 # test and a check for multiple result rows here allows us to 

118 # provide a more useful diagnostic, as well as allowing 

119 # `select` to support more general queries where multiple 

120 # results are not an error. 

121 if results.fetchone() is not None: 

122 raise RuntimeError( 

123 f"Multiple matches found for calibration lookup in {collection.name} for " 

124 f"{self.datasetType.name} with {dataId} overlapping {timespan}. " 

125 ) 

126 datasetType = self.datasetType 

127 if storage_class is not None: 

128 datasetType = datasetType.overrideStorageClass(storage_class) 

129 return DatasetRef( 

130 datasetType=datasetType, 

131 dataId=dataId, 

132 id=row.id, 

133 run=self._collections[row._mapping[self._runKeyColumn]].name, 

134 ) 

135 
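
# --- Editor's illustrative sketch; not part of _storage.py -------------------
# The comment in `find` above explains that a calibration lookup uses an
# *overlap* test rather than a *containment* test, so more than one validity
# range can match and a second fetchone() is used to detect that.  A minimal,
# stand-alone sketch of the distinction, with half-open [begin, end) ranges as
# plain tuples (all names here are hypothetical):

def overlaps(a: tuple[int, int], b: tuple[int, int]) -> bool:
    """Half-open ranges overlap iff each one begins before the other ends."""
    return a[0] < b[1] and b[0] < a[1]

def contains(outer: tuple[int, int], inner: tuple[int, int]) -> bool:
    """`outer` contains `inner` iff it starts no later and ends no earlier."""
    return outer[0] <= inner[0] and inner[1] <= outer[1]

# Disjoint validity ranges for one dataset type + collection + data ID.
validity_ranges = [(0, 10), (10, 20)]
query_timespan = (5, 15)

# The overlap test matches both ranges (hence the "multiple matches" check),
# while a containment test would match neither.
assert sum(overlaps(r, query_timespan) for r in validity_ranges) == 2
assert sum(contains(r, query_timespan) for r in validity_ranges) == 0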

136 def delete(self, datasets: Iterable[DatasetRef]) -> None: 

137 # Docstring inherited from DatasetRecordStorage. 

138 # Only delete from common dataset table; ON DELETE foreign key clauses 

139 # will handle the rest. 

140 self._db.delete( 

141 self._static.dataset, 

142 ["id"], 

143 *[{"id": dataset.getCheckedId()} for dataset in datasets], 

144 ) 

145 

146 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

147 # Docstring inherited from DatasetRecordStorage. 

148 if collection.type is not CollectionType.TAGGED: 148 ↛ 149 (line 148 didn't jump to line 149, because the condition on line 148 was never true)

149 raise TypeError( 

150 f"Cannot associate into collection '{collection.name}' " 

151 f"of type {collection.type.name}; must be TAGGED." 

152 ) 

153 protoRow = { 

154 self._collections.getCollectionForeignKeyName(): collection.key, 

155 "dataset_type_id": self._dataset_type_id, 

156 } 

157 rows = [] 

158 summary = CollectionSummary() 

159 for dataset in summary.add_datasets_generator(datasets): 

160 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

161 for dimension, value in dataset.dataId.items(): 

162 row[dimension.name] = value 

163 rows.append(row) 

164 # Update the summary tables for this collection in case this is the 

165 # first time this dataset type or these governor values will be 

166 # inserted there. 

167 self._summaries.update(collection, [self._dataset_type_id], summary) 

168 # Update the tag table itself. 

169 self._db.replace(self._tags, *rows) 

170 

171 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

172 # Docstring inherited from DatasetRecordStorage. 

173 if collection.type is not CollectionType.TAGGED: 173 ↛ 174 (line 173 didn't jump to line 174, because the condition on line 173 was never true)

174 raise TypeError( 

175 f"Cannot disassociate from collection '{collection.name}' " 

176 f"of type {collection.type.name}; must be TAGGED." 

177 ) 

178 rows = [ 

179 { 

180 "dataset_id": dataset.getCheckedId(), 

181 self._collections.getCollectionForeignKeyName(): collection.key, 

182 } 

183 for dataset in datasets 

184 ] 

185 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows) 

186 

187 def _buildCalibOverlapQuery( 

188 self, collection: CollectionRecord, dataIds: DataCoordinateSet | None, timespan: Timespan 

189 ) -> SimpleQuery: 

190 assert self._calibs is not None 

191 # Start by building a SELECT query for any rows that would overlap 

192 # this one. 

193 query = SimpleQuery() 

194 query.join(self._calibs) 

195 # Add a WHERE clause matching the dataset type and collection. 

196 query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id) 

197 query.where.append( 

198 self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key 

199 ) 

200 # Add a WHERE clause matching any of the given data IDs. 

201 if dataIds is not None: 

202 dataIds.constrain( 

203 query, 

204 lambda name: self._calibs.columns[name], # type: ignore 

205 ) 

206 # Add WHERE clause for timespan overlaps. 

207 TimespanReprClass = self._db.getTimespanRepresentation() 

208 query.where.append( 

209 TimespanReprClass.from_columns(self._calibs.columns).overlaps( 

210 TimespanReprClass.fromLiteral(timespan) 

211 ) 

212 ) 

213 return query 

214 
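
# --- Editor's illustrative sketch; not part of _storage.py -------------------
# `_buildCalibOverlapQuery` above assembles a SELECT on the calibs table with
# WHERE clauses for the dataset type, the collection, optional data IDs, and a
# timespan overlap.  A stand-alone SQLAlchemy Core analogue with a hypothetical
# `calibs` table; the overlap is written directly on begin/end columns, whereas
# the real code delegates it to the database's timespan representation class:

import sqlalchemy

metadata = sqlalchemy.MetaData()
calibs = sqlalchemy.Table(
    "calibs",
    metadata,
    sqlalchemy.Column("dataset_type_id", sqlalchemy.Integer),
    sqlalchemy.Column("collection_id", sqlalchemy.Integer),
    sqlalchemy.Column("timespan_begin", sqlalchemy.BigInteger),
    sqlalchemy.Column("timespan_end", sqlalchemy.BigInteger),
)

def overlap_count_query(dataset_type_id: int, collection_id: int, begin: int, end: int):
    # Half-open ranges [a, b) and [c, d) overlap iff a < d and c < b.
    return sqlalchemy.select(sqlalchemy.func.count()).where(
        calibs.c.dataset_type_id == dataset_type_id,
        calibs.c.collection_id == collection_id,
        calibs.c.timespan_begin < end,
        begin < calibs.c.timespan_end,
    )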

215 def certify( 

216 self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan 

217 ) -> None: 

218 # Docstring inherited from DatasetRecordStorage. 

219 if self._calibs is None: 219 ↛ 220 (line 219 didn't jump to line 220, because the condition on line 219 was never true)

220 raise CollectionTypeError( 

221 f"Cannot certify datasets of type {self.datasetType.name}, for which " 

222 f"DatasetType.isCalibration() is False." 

223 ) 

224 if collection.type is not CollectionType.CALIBRATION: 224 ↛ 225 (line 224 didn't jump to line 225, because the condition on line 224 was never true)

225 raise CollectionTypeError( 

226 f"Cannot certify into collection '{collection.name}' " 

227 f"of type {collection.type.name}; must be CALIBRATION." 

228 ) 

229 TimespanReprClass = self._db.getTimespanRepresentation() 

230 protoRow = { 

231 self._collections.getCollectionForeignKeyName(): collection.key, 

232 "dataset_type_id": self._dataset_type_id, 

233 } 

234 rows = [] 

235 dataIds: set[DataCoordinate] | None = ( 

236 set() if not TimespanReprClass.hasExclusionConstraint() else None 

237 ) 

238 summary = CollectionSummary() 

239 for dataset in summary.add_datasets_generator(datasets): 

240 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

241 for dimension, value in dataset.dataId.items(): 

242 row[dimension.name] = value 

243 TimespanReprClass.update(timespan, result=row) 

244 rows.append(row) 

245 if dataIds is not None: 245 ↛ 239 (line 245 didn't jump to line 239, because the condition on line 245 was never false)

246 dataIds.add(dataset.dataId) 

247 # Update the summary tables for this collection in case this is the 

248 # first time this dataset type or these governor values will be 

249 # inserted there. 

250 self._summaries.update(collection, [self._dataset_type_id], summary) 

251 # Update the association table itself. 

252 if TimespanReprClass.hasExclusionConstraint(): 252 ↛ 255 (line 252 didn't jump to line 255, because the condition on line 252 was never true)

253 # Rely on database constraint to enforce invariants; we just 

254 # reraise the exception for consistency across DB engines. 

255 try: 

256 self._db.insert(self._calibs, *rows) 

257 except sqlalchemy.exc.IntegrityError as err: 

258 raise ConflictingDefinitionError( 

259 f"Validity range conflict certifying datasets of type {self.datasetType.name} " 

260 f"into {collection.name} for range [{timespan.begin}, {timespan.end})." 

261 ) from err 

262 else: 

263 # Have to implement exclusion constraint ourselves. 

264 # Start by building a SELECT query for any rows that would overlap 

265 # this one. 

266 query = self._buildCalibOverlapQuery( 

267 collection, 

268 DataCoordinateSet(dataIds, graph=self.datasetType.dimensions), # type: ignore 

269 timespan, 

270 ) 

271 query.columns.append(sqlalchemy.sql.func.count()) 

272 sql = query.combine() 

273 # Acquire a table lock to ensure there are no concurrent writes 

274 # that could invalidate our checking before we finish the inserts. We

275 # use a SAVEPOINT in case there is an outer transaction that a 

276 # failure here should not roll back. 

277 with self._db.transaction(lock=[self._calibs], savepoint=True): 

278 # Run the check SELECT query. 

279 with self._db.query(sql) as sql_result: 

280 conflicting = sql_result.scalar() 

281 if conflicting > 0: 

282 raise ConflictingDefinitionError( 

283 f"{conflicting} validity range conflicts certifying datasets of type " 

284 f"{self.datasetType.name} into {collection.name} for range " 

285 f"[{timespan.begin}, {timespan.end})." 

286 ) 

287 # Proceed with the insert. 

288 self._db.insert(self._calibs, *rows) 

289 
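
# --- Editor's illustrative sketch; not part of _storage.py -------------------
# When the database cannot enforce a range-exclusion constraint, `certify`
# emulates one: lock the table, COUNT overlapping rows, and insert only if the
# count is zero.  The same check-then-insert pattern in plain Python, with a
# threading.Lock standing in for the table lock / SAVEPOINT used above:

import threading

_existing: list[tuple[int, int]] = []   # committed validity ranges
_lock = threading.Lock()                # stand-in for the table lock

def certify_range(begin: int, end: int) -> None:
    with _lock:  # nothing can be inserted between the check and the append
        conflicting = sum(1 for b, e in _existing if b < end and begin < e)
        if conflicting:
            raise RuntimeError(f"{conflicting} validity range conflicts for [{begin}, {end}).")
        _existing.append((begin, end))

certify_range(0, 10)
certify_range(10, 20)   # adjacent half-open ranges do not conflict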

290 def decertify( 

291 self, 

292 collection: CollectionRecord, 

293 timespan: Timespan, 

294 *, 

295 dataIds: Iterable[DataCoordinate] | None = None, 

296 ) -> None: 

297 # Docstring inherited from DatasetRecordStorage. 

298 if self._calibs is None: 298 ↛ 299 (line 298 didn't jump to line 299, because the condition on line 298 was never true)

299 raise CollectionTypeError( 

300 f"Cannot decertify datasets of type {self.datasetType.name}, for which " 

301 f"DatasetType.isCalibration() is False." 

302 ) 

303 if collection.type is not CollectionType.CALIBRATION: 303 ↛ 304 (line 303 didn't jump to line 304, because the condition on line 303 was never true)

304 raise CollectionTypeError( 

305 f"Cannot decertify from collection '{collection.name}' " 

306 f"of type {collection.type.name}; must be CALIBRATION." 

307 ) 

308 TimespanReprClass = self._db.getTimespanRepresentation() 

309 # Construct a SELECT query to find all rows that overlap our inputs. 

310 dataIdSet: DataCoordinateSet | None 

311 if dataIds is not None: 

312 dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions) 

313 else: 

314 dataIdSet = None 

315 query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan) 

316 query.columns.extend(self._calibs.columns) 

317 sql = query.combine() 

318 # Set up collections to populate with the rows we'll want to modify. 

319 # The insert rows will have the same values for collection and 

320 # dataset type. 

321 protoInsertRow = { 

322 self._collections.getCollectionForeignKeyName(): collection.key, 

323 "dataset_type_id": self._dataset_type_id, 

324 } 

325 rowsToDelete = [] 

326 rowsToInsert = [] 

327 # Acquire a table lock to ensure there are no concurrent writes 

328 # between the SELECT and the DELETE and INSERT queries based on it. 

329 with self._db.transaction(lock=[self._calibs], savepoint=True): 

330 with self._db.query(sql) as sql_result: 

331 sql_rows = sql_result.mappings().fetchall() 

332 for row in sql_rows: 

333 rowsToDelete.append({"id": row["id"]}) 

334 # Construct the insert row(s) by copying the prototype row, 

335 # then adding the dimension column values, then adding what's 

336 # left of the timespan from that row after we subtract the 

337 # given timespan. 

338 newInsertRow = protoInsertRow.copy() 

339 newInsertRow["dataset_id"] = row["dataset_id"] 

340 for name in self.datasetType.dimensions.required.names: 

341 newInsertRow[name] = row[name] 

342 rowTimespan = TimespanReprClass.extract(row) 

343 assert rowTimespan is not None, "Field should have a NOT NULL constraint." 

344 for diffTimespan in rowTimespan.difference(timespan): 

345 rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy())) 

346 # Run the DELETE and INSERT queries. 

347 self._db.delete(self._calibs, ["id"], *rowsToDelete) 

348 self._db.insert(self._calibs, *rowsToInsert) 

349 
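
# --- Editor's illustrative sketch; not part of _storage.py -------------------
# `decertify` deletes every overlapping row and re-inserts whatever is left of
# its validity range after subtracting the decertified timespan (the real code
# uses Timespan.difference).  The same arithmetic on half-open integer ranges:

def difference(row: tuple[int, int], remove: tuple[int, int]) -> list[tuple[int, int]]:
    """Pieces of `row` that survive after removing its overlap with `remove`."""
    pieces = []
    if row[0] < remove[0]:                   # part before the removed range
        pieces.append((row[0], min(row[1], remove[0])))
    if remove[1] < row[1]:                   # part after the removed range
        pieces.append((max(row[0], remove[1]), row[1]))
    return pieces

# Decertifying [5, 15) against a row certified for [0, 20) yields two
# replacement rows, [0, 5) and [15, 20).
assert difference((0, 20), (5, 15)) == [(0, 5), (15, 20)]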

350 def select( 

351 self, 

352 *collections: CollectionRecord, 

353 dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select, 

354 id: SimpleQuery.Select.Or[int | None] = SimpleQuery.Select, 

355 run: SimpleQuery.Select.Or[None] = SimpleQuery.Select, 

356 timespan: SimpleQuery.Select.Or[Timespan | None] = SimpleQuery.Select, 

357 ingestDate: SimpleQuery.Select.Or[Timespan | None] = None, 

358 rank: SimpleQuery.Select.Or[None] = None, 

359 ) -> sqlalchemy.sql.Selectable: 

360 # Docstring inherited from DatasetRecordStorage. 

361 collection_types = {collection.type for collection in collections} 

362 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened." 

363 TimespanReprClass = self._db.getTimespanRepresentation() 

364 # 

365 # There are two kinds of table in play here: 

366 # 

367 # - the static dataset table (with the dataset ID, dataset type ID, 

368 # run ID/name, and ingest date); 

369 # 

370 # - the dynamic tags/calibs table (with the dataset ID, dataset type 

371 # type ID, collection ID/name, data ID, and possibly validity 

372 # range). 

373 # 

374 # That means that we might want to return a query against either table 

375 # or a JOIN of both, depending on which quantities the caller wants. 

376 # But this method is documented/typed such that ``dataId`` is never 

377 # `None` - i.e. we always constrain or retrieve the data ID. That

378 # means we'll always include the tags/calibs table and join in the 

379 # static dataset table only if we need things from it that we can't get 

380 # from the tags/calibs table. 

381 # 

382 # Note that it's important that we include a WHERE constraint on both 

383 # tables for any column (e.g. dataset_type_id) that is in both when 

384 # it's given explicitly; not doing so can prevent the query planner from

385 # using very important indexes. At present, we don't include those 

386 # redundant columns in the JOIN ON expression, however, because the 

387 # FOREIGN KEY (and its index) are defined only on dataset_id. 

388 # 

389 # We'll start by accumulating kwargs to pass to SimpleQuery.join when 

390 # we bring in the tags/calibs table. We get the data ID or constrain 

391 # it in the tags/calibs table(s), but that's multiple columns, not one, 

392 # so we need to transform the one Select.Or argument into a dictionary 

393 # of them. 

394 kwargs: dict[str, Any] 

395 if dataId is SimpleQuery.Select: 

396 kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required} 

397 else: 

398 kwargs = dict(dataId.byName()) 

399 # We always constrain (never retrieve) the dataset type in at least the 

400 # tags/calibs table. 

401 kwargs["dataset_type_id"] = self._dataset_type_id 

402 # Join in the tags and/or calibs tables, turning those 'kwargs' entries 

403 # into WHERE constraints or SELECT columns as appropriate. 

404 if collection_types != {CollectionType.CALIBRATION}: 

405 # We'll need a subquery for the tags table if any of the given 

406 # collections are not a CALIBRATION collection. This intentionally 

407 # also fires when the list of collections is empty as a way to 

408 # create a dummy subquery that we know will fail. 

409 tags_query = SimpleQuery() 

410 tags_query.join(self._tags, **kwargs) 

411 # If the timespan is requested, simulate a potentially compound 

412 # column whose values are the maximum and minimum timespan 

413 # bounds. 

414 # If the timespan is constrained, ignore the constraint, since 

415 # it'd be guaranteed to evaluate to True. 

416 if timespan is SimpleQuery.Select: 

417 tags_query.columns.extend(TimespanReprClass.fromLiteral(Timespan(None, None)).flatten()) 

418 self._finish_single_select( 

419 tags_query, 

420 self._tags, 

421 collections, 

422 id=id, 

423 run=run, 

424 ingestDate=ingestDate, 

425 rank=rank, 

426 ) 

427 else: 

428 tags_query = None 

429 if CollectionType.CALIBRATION in collection_types: 

430 # If at least one collection is a CALIBRATION collection, we'll 

431 # need a subquery for the calibs table, and could include the 

432 # timespan as a result or constraint. 

433 calibs_query = SimpleQuery() 

434 assert ( 

435 self._calibs is not None 

436 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection." 

437 calibs_query.join(self._calibs, **kwargs) 

438 # Add the timespan column(s) to the result columns, or constrain 

439 # the timespan via an overlap condition. 

440 if timespan is SimpleQuery.Select: 

441 calibs_query.columns.extend(TimespanReprClass.from_columns(self._calibs.columns).flatten()) 

442 elif timespan is not None: 

443 calibs_query.where.append( 

444 TimespanReprClass.from_columns(self._calibs.columns).overlaps( 

445 TimespanReprClass.fromLiteral(timespan) 

446 ) 

447 ) 

448 self._finish_single_select( 

449 calibs_query, 

450 self._calibs, 

451 collections, 

452 id=id, 

453 run=run, 

454 ingestDate=ingestDate, 

455 rank=rank, 

456 ) 

457 else: 

458 calibs_query = None 

459 if calibs_query is not None: 

460 if tags_query is not None: 

461 return tags_query.combine().union(calibs_query.combine()) 

462 else: 

463 return calibs_query.combine() 

464 else: 

465 assert tags_query is not None, "Earlier logic should guarantee that at least one is not None."

466 return tags_query.combine() 

467 
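
# --- Editor's illustrative sketch; not part of _storage.py -------------------
# `select` above returns the tags subquery, the calibs subquery, or their
# UNION when both non-CALIBRATION and CALIBRATION collections are searched.
# A stand-alone SQLAlchemy Core analogue with two hypothetical tables; the
# UNION is well-formed because both subqueries expose the same column labels:

import sqlalchemy

metadata = sqlalchemy.MetaData()
tags = sqlalchemy.Table(
    "tags", metadata,
    sqlalchemy.Column("dataset_id", sqlalchemy.Integer),
    sqlalchemy.Column("collection_id", sqlalchemy.Integer),
)
calibs = sqlalchemy.Table(
    "calibs", metadata,
    sqlalchemy.Column("dataset_id", sqlalchemy.Integer),
    sqlalchemy.Column("collection_id", sqlalchemy.Integer),
)

tags_query = sqlalchemy.select(tags.c.dataset_id.label("id"), tags.c.collection_id)
calibs_query = sqlalchemy.select(calibs.c.dataset_id.label("id"), calibs.c.collection_id)
combined = tags_query.union(calibs_query)   # both kinds of collection searched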

468 def _finish_single_select( 

469 self, 

470 query: SimpleQuery, 

471 table: sqlalchemy.schema.Table, 

472 collections: Sequence[CollectionRecord], 

473 id: SimpleQuery.Select.Or[int | None], 

474 run: SimpleQuery.Select.Or[None], 

475 ingestDate: SimpleQuery.Select.Or[Timespan | None], 

476 rank: SimpleQuery.Select.Or[None], 

477 ) -> None: 

478 dataset_id_col = table.columns.dataset_id 

479 collection_col = table.columns[self._collections.getCollectionForeignKeyName()] 

480 # We always constrain (never retrieve) the collection(s) in the 

481 # tags/calibs table. 

482 if len(collections) == 1: 

483 query.where.append(collection_col == collections[0].key) 

484 elif len(collections) == 0: 

485 # We support the case where there are no collections as a way to 

486 # generate a valid SQL query that can't yield results. This should 

487 # never get executed, but lots of downstream code will still try 

488 # to access the SQLAlchemy objects representing the columns in the 

489 # subquery. That's not ideal, but it'd take a lot of refactoring 

490 # to fix it (DM-31725). 

491 query.where.append(sqlalchemy.sql.literal(False)) 

492 else: 

493 query.where.append(collection_col.in_([collection.key for collection in collections])) 

494 # Add rank, if requested, as a CASE-based calculation on the collection

495 # column. 

496 if rank is not None: 

497 assert rank is SimpleQuery.Select, "Cannot constrain rank, only select it."

498 query.columns.append( 

499 sqlalchemy.sql.case( 

500 {record.key: n for n, record in enumerate(collections)}, 

501 value=collection_col, 

502 ).label("rank") 

503 ) 

504 # We can always get the dataset_id from the tags/calibs table or 

505 # constrain it there. Can't use kwargs for that because we need to 

506 # alias it to 'id'. 

507 if id is SimpleQuery.Select: 

508 query.columns.append(dataset_id_col.label("id")) 

509 elif id is not None: 509 ↛ 510 (line 509 didn't jump to line 510, because the condition on line 509 was never true)

510 query.where.append(dataset_id_col == id) 

511 # It's possible we now have everything we need, from just the 

512 # tags/calibs table. The things we might need to get from the static 

513 # dataset table are the run key and the ingest date. 

514 need_static_table = False 

515 static_kwargs: dict[str, Any] = {} 

516 if run is not None: 

517 assert run is SimpleQuery.Select, "To constrain the run name, pass a RunRecord as a collection." 

518 if len(collections) == 1 and collections[0].type is CollectionType.RUN: 

519 # If we are searching exactly one RUN collection, we 

520 # know that if we find the dataset in that collection, 

521 # then that's the dataset's run; we don't need to

522 # query for it. 

523 query.columns.append(sqlalchemy.sql.literal(collections[0].key).label(self._runKeyColumn)) 

524 else: 

525 static_kwargs[self._runKeyColumn] = SimpleQuery.Select 

526 need_static_table = True 

527 # Ingest date can only come from the static table. 

528 if ingestDate is not None: 

529 need_static_table = True 

530 if ingestDate is SimpleQuery.Select: 530 ↛ 533 (line 530 didn't jump to line 533, because the condition on line 530 was never false)

531 static_kwargs["ingest_date"] = SimpleQuery.Select 

532 else: 

533 assert isinstance(ingestDate, Timespan) 

534 # Timespan is astropy Time (usually in TAI) and ingest_date is 

535 # TIMESTAMP, convert values to Python datetime for sqlalchemy. 

536 if ingestDate.isEmpty(): 

537 raise RuntimeError("Empty timespan constraint provided for ingest_date.") 

538 if ingestDate.begin is not None: 

539 begin = ingestDate.begin.utc.datetime # type: ignore 

540 query.where.append(self._static.dataset.columns.ingest_date >= begin) 

541 if ingestDate.end is not None: 

542 end = ingestDate.end.utc.datetime # type: ignore 

543 query.where.append(self._static.dataset.columns.ingest_date < end) 

544 # If we need the static table, join it in via dataset_id and 

545 # dataset_type_id 

546 if need_static_table: 

547 query.join( 

548 self._static.dataset, 

549 onclause=(dataset_id_col == self._static.dataset.columns.id), 

550 **static_kwargs, 

551 ) 

552 # Also constrain dataset_type_id in static table in case that helps 

553 # generate a better plan. 

554 # We could also include this in the JOIN ON clause, but my guess is 

555 # that that's a good idea IFF it's in the foreign key, and right 

556 # now it isn't. 

557 query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id) 

558 
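
# --- Editor's illustrative sketch; not part of _storage.py -------------------
# The "rank" column built in `_finish_single_select` is a simple CASE keyed on
# the collection column: the first collection searched gets rank 0, the next
# rank 1, and so on.  A stand-alone version with a hypothetical column and
# collection keys:

import sqlalchemy

collection_col = sqlalchemy.column("collection_id")
collection_keys = [11, 7, 42]   # the order in which collections are searched

rank = sqlalchemy.case(
    {key: n for n, key in enumerate(collection_keys)},
    value=collection_col,
).label("rank")

# Renders as SELECT CASE collection_id WHEN ... THEN ... END AS rank (the
# keys and positions appear as bind parameters).
print(sqlalchemy.select(rank))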

559 def getDataId(self, id: DatasetId) -> DataCoordinate: 

560 """Return DataId for a dataset. 

561 

562 Parameters 

563 ---------- 

564 id : `DatasetId` 

565 Unique dataset identifier. 

566 

567 Returns 

568 ------- 

569 dataId : `DataCoordinate` 

570 DataId for the dataset. 

571 """ 

572 # This query could return multiple rows (one for each tagged collection 

573 # the dataset is in, plus one for its run collection), and we don't 

574 # care which of those we get. 

575 sql = ( 

576 self._tags.select() 

577 .where( 

578 sqlalchemy.sql.and_( 

579 self._tags.columns.dataset_id == id, 

580 self._tags.columns.dataset_type_id == self._dataset_type_id, 

581 ) 

582 ) 

583 .limit(1) 

584 ) 

585 with self._db.query(sql) as sql_result: 

586 row = sql_result.mappings().fetchone() 

587 assert row is not None, "Should be guaranteed by caller and foreign key constraints." 

588 return DataCoordinate.standardize( 

589 {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required}, 

590 graph=self.datasetType.dimensions, 

591 ) 

592 

593 

594@deprecated( 

595 "Integer dataset IDs are deprecated in favor of UUIDs; support will be removed after v26. " 

596 "Please migrate or re-create this data repository.", 

597 version="v25.0", 

598 category=FutureWarning, 

599) 

600class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage): 

601 """Implementation of ByDimensionsDatasetRecordStorage which uses integer 

602 auto-incremented column for dataset IDs. 

603 """ 

604 

605 def insert( 

606 self, 

607 run: RunRecord, 

608 dataIds: Iterable[DataCoordinate], 

609 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

610 ) -> Iterator[DatasetRef]: 

611 # Docstring inherited from DatasetRecordStorage. 

612 

613 # We only support UNIQUE mode for integer dataset IDs 

614 if idMode != DatasetIdGenEnum.UNIQUE: 

615 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.") 

616 

617 # Transform a possibly-single-pass iterable into a list. 

618 dataIdList = list(dataIds) 

619 yield from self._insert(run, dataIdList) 

620 

621 def import_( 

622 self, 

623 run: RunRecord, 

624 datasets: Iterable[DatasetRef], 

625 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

626 reuseIds: bool = False, 

627 ) -> Iterator[DatasetRef]: 

628 # Docstring inherited from DatasetRecordStorage. 

629 

630 # We only support UNIQUE mode for integer dataset IDs 

631 if idGenerationMode != DatasetIdGenEnum.UNIQUE: 

632 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.") 

633 

634 # Make a list of dataIds and optionally dataset IDs. 

635 dataIdList: list[DataCoordinate] = [] 

636 datasetIdList: list[int] = [] 

637 for dataset in datasets: 

638 dataIdList.append(dataset.dataId) 

639 

640 # We only accept integer dataset IDs, but also allow None. 

641 datasetId = dataset.id 

642 if datasetId is None: 

643 # if reuseIds is set then all IDs must be known 

644 if reuseIds: 

645 raise TypeError("All dataset IDs must be known if `reuseIds` is set") 

646 elif isinstance(datasetId, int): 

647 if reuseIds: 

648 datasetIdList.append(datasetId) 

649 else: 

650 raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}") 

651 

652 yield from self._insert(run, dataIdList, datasetIdList) 

653 

654 def _insert( 

655 self, run: RunRecord, dataIdList: list[DataCoordinate], datasetIdList: list[int] | None = None 

656 ) -> Iterator[DatasetRef]: 

657 """Common part of implementation of `insert` and `import_` methods.""" 

658 

659 # Remember any governor dimension values we see. 

660 summary = CollectionSummary() 

661 summary.add_data_ids(self.datasetType, dataIdList) 

662 

663 staticRow = { 

664 "dataset_type_id": self._dataset_type_id, 

665 self._runKeyColumn: run.key, 

666 } 

667 with self._db.transaction(): 

668 # Insert into the static dataset table, generating autoincrement 

669 # dataset_id values. 

670 if datasetIdList: 

671 # reuse existing IDs 

672 rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList] 

673 self._db.insert(self._static.dataset, *rows) 

674 else: 

675 # use auto-incremented IDs 

676 datasetIdList = self._db.insert( 

677 self._static.dataset, *([staticRow] * len(dataIdList)), returnIds=True 

678 ) 

679 assert datasetIdList is not None 

680 # Update the summary tables for this collection in case this is the 

681 # first time this dataset type or these governor values will be 

682 # inserted there. 

683 self._summaries.update(run, [self._dataset_type_id], summary) 

684 # Combine the generated dataset_id values and data ID fields to 

685 # form rows to be inserted into the tags table. 

686 protoTagsRow = { 

687 "dataset_type_id": self._dataset_type_id, 

688 self._collections.getCollectionForeignKeyName(): run.key, 

689 } 

690 tagsRows = [ 

691 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

692 for dataId, dataset_id in zip(dataIdList, datasetIdList) 

693 ] 

694 # Insert those rows into the tags table. This is where we'll 

695 # get any unique constraint violations. 

696 self._db.insert(self._tags, *tagsRows) 

697 

698 for dataId, datasetId in zip(dataIdList, datasetIdList): 

699 yield DatasetRef( 

700 datasetType=self.datasetType, 

701 dataId=dataId, 

702 id=datasetId, 

703 run=run.name, 

704 ) 

705 

706 

707class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage): 

708 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for 

709 dataset IDs. 

710 """ 

711 

712 idMaker = DatasetIdFactory() 

713 """Factory for dataset IDs. In the future this factory may be shared with 

714 other classes (e.g. Registry).""" 

715 

716 def insert( 

717 self, 

718 run: RunRecord, 

719 dataIds: Iterable[DataCoordinate], 

720 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

721 ) -> Iterator[DatasetRef]: 

722 # Docstring inherited from DatasetRecordStorage. 

723 

724 # Iterate over data IDs, transforming a possibly-single-pass iterable 

725 # into a list. 

726 dataIdList = [] 

727 rows = [] 

728 summary = CollectionSummary() 

729 for dataId in summary.add_data_ids_generator(self.datasetType, dataIds): 

730 dataIdList.append(dataId) 

731 rows.append( 

732 { 

733 "id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode), 

734 "dataset_type_id": self._dataset_type_id, 

735 self._runKeyColumn: run.key, 

736 } 

737 ) 

738 

739 with self._db.transaction(): 

740 # Insert into the static dataset table. 

741 self._db.insert(self._static.dataset, *rows) 

742 # Update the summary tables for this collection in case this is the 

743 # first time this dataset type or these governor values will be 

744 # inserted there. 

745 self._summaries.update(run, [self._dataset_type_id], summary) 

746 # Combine the generated dataset_id values and data ID fields to 

747 # form rows to be inserted into the tags table. 

748 protoTagsRow = { 

749 "dataset_type_id": self._dataset_type_id, 

750 self._collections.getCollectionForeignKeyName(): run.key, 

751 } 

752 tagsRows = [ 

753 dict(protoTagsRow, dataset_id=row["id"], **dataId.byName()) 

754 for dataId, row in zip(dataIdList, rows) 

755 ] 

756 # Insert those rows into the tags table. 

757 self._db.insert(self._tags, *tagsRows) 

758 

759 for dataId, row in zip(dataIdList, rows): 

760 yield DatasetRef( 

761 datasetType=self.datasetType, 

762 dataId=dataId, 

763 id=row["id"], 

764 run=run.name, 

765 ) 

766 
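
# --- Editor's illustrative sketch; not part of _storage.py -------------------
# `idMaker.makeDatasetId` produces either a random UUID or, in the
# deterministic DatasetIdGenEnum modes, an ID derived from the dataset's
# identity so that importing the same dataset twice yields the same UUID.  A
# minimal sketch of that idea; the namespace and key layout below are made up
# and are not the ones DatasetIdFactory actually uses:

import uuid

_NAMESPACE = uuid.UUID("00000000-0000-0000-0000-000000000000")  # hypothetical

def make_dataset_id(run: str, dataset_type: str, data_id: dict, deterministic: bool) -> uuid.UUID:
    if not deterministic:
        return uuid.uuid4()                 # UNIQUE mode: random every time
    key = ";".join([run, dataset_type] + [f"{k}={v}" for k, v in sorted(data_id.items())])
    return uuid.uuid5(_NAMESPACE, key)      # same inputs give the same UUID

a = make_dataset_id("run/1", "flat", {"detector": 42}, deterministic=True)
b = make_dataset_id("run/1", "flat", {"detector": 42}, deterministic=True)
assert a == b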

767 def import_( 

768 self, 

769 run: RunRecord, 

770 datasets: Iterable[DatasetRef], 

771 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

772 reuseIds: bool = False, 

773 ) -> Iterator[DatasetRef]: 

774 # Docstring inherited from DatasetRecordStorage. 

775 

776 # Iterate over data IDs, transforming a possibly-single-pass iterable 

777 # into a list. 

778 dataIds = {} 

779 summary = CollectionSummary() 

780 for dataset in summary.add_datasets_generator(datasets): 

781 # Ignore unknown ID types, normally all IDs have the same type but 

782 # this code supports mixed types or missing IDs. 

783 datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None 

784 if datasetId is None: 

785 datasetId = self.idMaker.makeDatasetId( 

786 run.name, self.datasetType, dataset.dataId, idGenerationMode 

787 ) 

788 dataIds[datasetId] = dataset.dataId 

789 

790 # We'll insert all new rows into a temporary table 

791 tableSpec = makeTagTableSpec(self.datasetType, type(self._collections), ddl.GUID, constraints=False) 

792 collFkName = self._collections.getCollectionForeignKeyName() 

793 protoTagsRow = { 

794 "dataset_type_id": self._dataset_type_id, 

795 collFkName: run.key, 

796 } 

797 tmpRows = [ 

798 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

799 for dataset_id, dataId in dataIds.items() 

800 ] 

801 with self._db.transaction(for_temp_tables=True): 

802 with self._db.temporary_table(tableSpec) as tmp_tags: 

803 # store all incoming data in a temporary table 

804 self._db.insert(tmp_tags, *tmpRows) 

805 

806 # There are some checks that we want to make for consistency 

807 # of the new datasets with existing ones. 

808 self._validateImport(tmp_tags, run) 

809 

810 # Before we merge temporary table into dataset/tags we need to 

811 # drop datasets which are already there (and do not conflict). 

812 self._db.deleteWhere( 

813 tmp_tags, 

814 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)), 

815 ) 

816 

817 # Copy it into dataset table, need to re-label some columns. 

818 self._db.insert( 

819 self._static.dataset, 

820 select=sqlalchemy.sql.select( 

821 tmp_tags.columns.dataset_id.label("id"), 

822 tmp_tags.columns.dataset_type_id, 

823 tmp_tags.columns[collFkName].label(self._runKeyColumn), 

824 ), 

825 ) 

826 

827 # Update the summary tables for this collection in case this 

828 # is the first time this dataset type or these governor values 

829 # will be inserted there. 

830 self._summaries.update(run, [self._dataset_type_id], summary) 

831 

832 # Copy it into tags table. 

833 self._db.insert(self._tags, select=tmp_tags.select()) 

834 

835 # Return refs in the same order as in the input list. 

836 for dataset_id, dataId in dataIds.items(): 

837 yield DatasetRef( 

838 datasetType=self.datasetType, 

839 id=dataset_id, 

840 dataId=dataId, 

841 run=run.name, 

842 ) 

843 
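
# --- Editor's illustrative sketch; not part of _storage.py -------------------
# `import_` above stages incoming rows in a temporary table, validates them,
# drops dataset IDs that already exist, and then merges what is left into the
# static dataset table and the tags table with INSERT ... SELECT.  The same
# staging pattern against a throwaway in-memory SQLite database (the table
# names and columns are simplified, not the real schema):

import sqlalchemy

engine = sqlalchemy.create_engine("sqlite://")
metadata = sqlalchemy.MetaData()
dataset = sqlalchemy.Table(
    "dataset", metadata, sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True)
)
tmp_tags = sqlalchemy.Table(
    "tmp_tags", metadata, sqlalchemy.Column("dataset_id", sqlalchemy.Integer)
)
metadata.create_all(engine)

with engine.begin() as conn:
    conn.execute(sqlalchemy.insert(dataset), [{"id": 1}])      # already present
    conn.execute(sqlalchemy.insert(tmp_tags), [{"dataset_id": 1}, {"dataset_id": 2}])
    # Drop staged rows whose dataset already exists ...
    conn.execute(
        sqlalchemy.delete(tmp_tags).where(
            tmp_tags.c.dataset_id.in_(sqlalchemy.select(dataset.c.id))
        )
    )
    # ... and merge the remainder with INSERT ... SELECT.
    conn.execute(
        sqlalchemy.insert(dataset).from_select(["id"], sqlalchemy.select(tmp_tags.c.dataset_id))
    )
    assert {row.id for row in conn.execute(sqlalchemy.select(dataset.c.id))} == {1, 2}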

844 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None: 

845 """Validate imported refs against existing datasets. 

846 

847 Parameters 

848 ---------- 

849 tmp_tags : `sqlalchemy.schema.Table` 

850 Temporary table with new datasets and the same schema as tags 

851 table. 

852 run : `RunRecord` 

853 The record object describing the `~CollectionType.RUN` collection. 

854 

855 Raises 

856 ------ 

857 ConflictingDefinitionError 

858 Raise if new datasets conflict with existing ones. 

859 """ 

860 dataset = self._static.dataset 

861 tags = self._tags 

862 collFkName = self._collections.getCollectionForeignKeyName() 

863 

864 # Check that existing datasets have the same dataset type and 

865 # run. 

866 query = ( 

867 sqlalchemy.sql.select( 

868 dataset.columns.id.label("dataset_id"), 

869 dataset.columns.dataset_type_id.label("dataset_type_id"), 

870 tmp_tags.columns.dataset_type_id.label("new dataset_type_id"), 

871 dataset.columns[self._runKeyColumn].label("run"), 

872 tmp_tags.columns[collFkName].label("new run"), 

873 ) 

874 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id)) 

875 .where( 

876 sqlalchemy.sql.or_( 

877 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

878 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName], 

879 ) 

880 ) 

881 .limit(1) 

882 ) 

883 with self._db.query(query) as result: 

884 if (row := result.first()) is not None: 

885 # Only include the first one in the exception message 

886 raise ConflictingDefinitionError( 

887 f"Existing dataset type or run do not match new dataset: {row._asdict()}" 

888 ) 

889 

890 # Check that matching dataset in tags table has the same DataId. 

891 query = ( 

892 sqlalchemy.sql.select( 

893 tags.columns.dataset_id, 

894 tags.columns.dataset_type_id.label("type_id"), 

895 tmp_tags.columns.dataset_type_id.label("new type_id"), 

896 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

897 *[ 

898 tmp_tags.columns[dim].label(f"new {dim}") 

899 for dim in self.datasetType.dimensions.required.names 

900 ], 

901 ) 

902 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id)) 

903 .where( 

904 sqlalchemy.sql.or_( 

905 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

906 *[ 

907 tags.columns[dim] != tmp_tags.columns[dim] 

908 for dim in self.datasetType.dimensions.required.names 

909 ], 

910 ) 

911 ) 

912 .limit(1) 

913 ) 

914 

915 with self._db.query(query) as result: 

916 if (row := result.first()) is not None: 

917 # Only include the first one in the exception message 

918 raise ConflictingDefinitionError( 

919 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}" 

920 ) 

921 

922 # Check that matching run+dataId have the same dataset ID. 

923 query = ( 

924 sqlalchemy.sql.select( 

925 tags.columns.dataset_type_id.label("dataset_type_id"), 

926 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

927 tags.columns.dataset_id, 

928 tmp_tags.columns.dataset_id.label("new dataset_id"), 

929 tags.columns[collFkName], 

930 tmp_tags.columns[collFkName].label(f"new {collFkName}"), 

931 ) 

932 .select_from( 

933 tags.join( 

934 tmp_tags, 

935 sqlalchemy.sql.and_( 

936 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id, 

937 tags.columns[collFkName] == tmp_tags.columns[collFkName], 

938 *[ 

939 tags.columns[dim] == tmp_tags.columns[dim] 

940 for dim in self.datasetType.dimensions.required.names 

941 ], 

942 ), 

943 ) 

944 ) 

945 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id) 

946 .limit(1) 

947 ) 

948 with self._db.query(query) as result: 

949 if (row := result.first()) is not None: 

950 # only include the first one in the exception message 

951 raise ConflictingDefinitionError( 

952 f"Existing dataset type and dataId does not match new dataset: {row._asdict()}" 

953 )