Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 79%

306 statements  

coverage.py v6.5.0, created at 2023-10-26 15:13 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23from __future__ import annotations 

24 

25__all__ = ("ByDimensionsDatasetRecordStorage",) 

26 

27import uuid 

28from collections.abc import Iterable, Iterator, Sequence 

29from typing import TYPE_CHECKING, Any 

30 

31import sqlalchemy 

32from deprecated.sphinx import deprecated 

33 

34from ....core import ( 

35 DataCoordinate, 

36 DataCoordinateSet, 

37 DatasetId, 

38 DatasetRef, 

39 DatasetType, 

40 SimpleQuery, 

41 Timespan, 

42 ddl, 

43) 

44from ..._collection_summary import CollectionSummary 

45from ..._collectionType import CollectionType 

46from ..._exceptions import CollectionTypeError, ConflictingDefinitionError, UnsupportedIdGeneratorError 

47from ...interfaces import DatasetIdFactory, DatasetIdGenEnum, DatasetRecordStorage 

48from .tables import makeTagTableSpec 

49 

50if TYPE_CHECKING:    50 ↛ 51 (line 50 didn't jump to line 51, because the condition on line 50 was never true)

51 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord 

52 from .summaries import CollectionSummaryManager 

53 from .tables import StaticDatasetTablesTuple 

54 

55 

56class ByDimensionsDatasetRecordStorage(DatasetRecordStorage): 

57 """Dataset record storage implementation paired with 

58 `ByDimensionsDatasetRecordStorageManager`; see that class for more 

59 information. 

60 

61 Instances of this class should never be constructed directly; use 

62 `DatasetRecordStorageManager.register` instead. 

63 """ 

64 

65 def __init__( 

66 self, 

67 *, 

68 datasetType: DatasetType, 

69 db: Database, 

70 dataset_type_id: int, 

71 collections: CollectionManager, 

72 static: StaticDatasetTablesTuple, 

73 summaries: CollectionSummaryManager, 

74 tags: sqlalchemy.schema.Table, 

75 calibs: sqlalchemy.schema.Table | None, 

76 ): 

77 super().__init__(datasetType=datasetType) 

78 self._dataset_type_id = dataset_type_id 

79 self._db = db 

80 self._collections = collections 

81 self._static = static 

82 self._summaries = summaries 

83 self._tags = tags 

84 self._calibs = calibs 

85 self._runKeyColumn = collections.getRunForeignKeyName() 

86 

87 def find( 

88 self, collection: CollectionRecord, dataId: DataCoordinate, timespan: Timespan | None = None 

89 ) -> DatasetRef | None: 

90 # Docstring inherited from DatasetRecordStorage. 

91 assert dataId.graph == self.datasetType.dimensions 

92 if collection.type is CollectionType.CALIBRATION and timespan is None:    92 ↛ 93 (line 92 didn't jump to line 93, because the condition on line 92 was never true)

93 raise TypeError( 

94 f"Cannot search for dataset in CALIBRATION collection {collection.name} " 

95 f"without an input timespan." 

96 ) 

97 sql = self.select( 

98 collection, dataId=dataId, id=SimpleQuery.Select, run=SimpleQuery.Select, timespan=timespan 

99 ) 

100 with self._db.query(sql) as results: 

101 row = results.fetchone() 

102 if row is None: 

103 return None 

104 if collection.type is CollectionType.CALIBRATION: 

105 # For temporal calibration lookups (only!) our invariants do 

106 # not guarantee that the number of result rows is <= 1. They 

107 # would if `select` constrained the given timespan to be 

108 # _contained_ by the validity range in the self._calibs table, 

109 # instead of simply _overlapping_ it, because we do guarantee 

110 # that the validity ranges are disjoint for a particular 

111 # dataset type, collection, and data ID. But using an overlap 

112 # test and a check for multiple result rows here allows us to 

113 # provide a more useful diagnostic, as well as allowing 

114 # `select` to support more general queries where multiple 

115 # results are not an error. 

116 if results.fetchone() is not None: 

117 raise RuntimeError( 

118 f"Multiple matches found for calibration lookup in {collection.name} for " 

119 f"{self.datasetType.name} with {dataId} overlapping {timespan}. " 

120 ) 

121 return DatasetRef( 

122 datasetType=self.datasetType, 

123 dataId=dataId, 

124 id=row.id, 

125 run=self._collections[row._mapping[self._runKeyColumn]].name, 

126 ) 

127 
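A calibration lookup through `find` must supply a timespan, since a CALIBRATION collection can hold many certifications of the same data ID. A minimal sketch of such a call, assuming a `storage` instance obtained from the dataset record storage manager, a CALIBRATION-type `collection` record, and a `data_id` matching the dataset type's dimensions (all three are hypothetical names here):

    from astropy.time import Time
    from lsst.daf.butler import Timespan

    # Hypothetical objects: `storage`, `collection`, and `data_id` come from an
    # existing registry; only the timespan is constructed here.
    validity = Timespan(Time("2023-10-01", scale="tai"), Time("2023-10-02", scale="tai"))
    ref = storage.find(collection, data_id, timespan=validity)
    if ref is None:
        print("No dataset certified for that data ID overlaps the given range.")
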

128 def delete(self, datasets: Iterable[DatasetRef]) -> None: 

129 # Docstring inherited from DatasetRecordStorage. 

130 # Only delete from common dataset table; ON DELETE foreign key clauses 

131 # will handle the rest. 

132 self._db.delete( 

133 self._static.dataset, 

134 ["id"], 

135 *[{"id": dataset.getCheckedId()} for dataset in datasets], 

136 ) 

137 

138 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

139 # Docstring inherited from DatasetRecordStorage. 

140 if collection.type is not CollectionType.TAGGED:    140 ↛ 141 (line 140 didn't jump to line 141, because the condition on line 140 was never true)

141 raise TypeError( 

142 f"Cannot associate into collection '{collection.name}' " 

143 f"of type {collection.type.name}; must be TAGGED." 

144 ) 

145 protoRow = { 

146 self._collections.getCollectionForeignKeyName(): collection.key, 

147 "dataset_type_id": self._dataset_type_id, 

148 } 

149 rows = [] 

150 summary = CollectionSummary() 

151 for dataset in summary.add_datasets_generator(datasets): 

152 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

153 for dimension, value in dataset.dataId.items(): 

154 row[dimension.name] = value 

155 rows.append(row) 

156 # Update the summary tables for this collection in case this is the 

157 # first time this dataset type or these governor values will be 

158 # inserted there. 

159 self._summaries.update(collection, [self._dataset_type_id], summary) 

160 # Update the tag table itself. 

161 self._db.replace(self._tags, *rows) 

162 

163 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

164 # Docstring inherited from DatasetRecordStorage. 

165 if collection.type is not CollectionType.TAGGED:    165 ↛ 166 (line 165 didn't jump to line 166, because the condition on line 165 was never true)

166 raise TypeError( 

167 f"Cannot disassociate from collection '{collection.name}' " 

168 f"of type {collection.type.name}; must be TAGGED." 

169 ) 

170 rows = [ 

171 { 

172 "dataset_id": dataset.getCheckedId(), 

173 self._collections.getCollectionForeignKeyName(): collection.key, 

174 } 

175 for dataset in datasets 

176 ] 

177 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows) 

178 

179 def _buildCalibOverlapQuery( 

180 self, collection: CollectionRecord, dataIds: DataCoordinateSet | None, timespan: Timespan 

181 ) -> SimpleQuery: 

182 assert self._calibs is not None 

183 # Start by building a SELECT query for any rows that would overlap 

184 # this one. 

185 query = SimpleQuery() 

186 query.join(self._calibs) 

187 # Add a WHERE clause matching the dataset type and collection. 

188 query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id) 

189 query.where.append( 

190 self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key 

191 ) 

192 # Add a WHERE clause matching any of the given data IDs. 

193 if dataIds is not None: 

194 dataIds.constrain( 

195 query, 

196 lambda name: self._calibs.columns[name], # type: ignore 

197 ) 

198 # Add WHERE clause for timespan overlaps. 

199 TimespanReprClass = self._db.getTimespanRepresentation() 

200 query.where.append( 

201 TimespanReprClass.from_columns(self._calibs.columns).overlaps( 

202 TimespanReprClass.fromLiteral(timespan) 

203 ) 

204 ) 

205 return query 

206 

207 def certify( 

208 self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan 

209 ) -> None: 

210 # Docstring inherited from DatasetRecordStorage. 

211 if self._calibs is None:    211 ↛ 212 (line 211 didn't jump to line 212, because the condition on line 211 was never true)

212 raise CollectionTypeError( 

213 f"Cannot certify datasets of type {self.datasetType.name}, for which " 

214 f"DatasetType.isCalibration() is False." 

215 ) 

216 if collection.type is not CollectionType.CALIBRATION:    216 ↛ 217 (line 216 didn't jump to line 217, because the condition on line 216 was never true)

217 raise CollectionTypeError( 

218 f"Cannot certify into collection '{collection.name}' " 

219 f"of type {collection.type.name}; must be CALIBRATION." 

220 ) 

221 TimespanReprClass = self._db.getTimespanRepresentation() 

222 protoRow = { 

223 self._collections.getCollectionForeignKeyName(): collection.key, 

224 "dataset_type_id": self._dataset_type_id, 

225 } 

226 rows = [] 

227 dataIds: set[DataCoordinate] | None = ( 

228 set() if not TimespanReprClass.hasExclusionConstraint() else None 

229 ) 

230 summary = CollectionSummary() 

231 for dataset in summary.add_datasets_generator(datasets): 

232 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

233 for dimension, value in dataset.dataId.items(): 

234 row[dimension.name] = value 

235 TimespanReprClass.update(timespan, result=row) 

236 rows.append(row) 

237 if dataIds is not None:    237 ↛ 231 (line 237 didn't jump to line 231, because the condition on line 237 was never false)

238 dataIds.add(dataset.dataId) 

239 # Update the summary tables for this collection in case this is the 

240 # first time this dataset type or these governor values will be 

241 # inserted there. 

242 self._summaries.update(collection, [self._dataset_type_id], summary) 

243 # Update the association table itself. 

244 if TimespanReprClass.hasExclusionConstraint():    244 ↛ 247 (line 244 didn't jump to line 247, because the condition on line 244 was never true)

245 # Rely on database constraint to enforce invariants; we just 

246 # reraise the exception for consistency across DB engines. 

247 try: 

248 self._db.insert(self._calibs, *rows) 

249 except sqlalchemy.exc.IntegrityError as err: 

250 raise ConflictingDefinitionError( 

251 f"Validity range conflict certifying datasets of type {self.datasetType.name} " 

252 f"into {collection.name} for range [{timespan.begin}, {timespan.end})." 

253 ) from err 

254 else: 

255 # Have to implement exclusion constraint ourselves. 

256 # Start by building a SELECT query for any rows that would overlap 

257 # this one. 

258 query = self._buildCalibOverlapQuery( 

259 collection, 

260 DataCoordinateSet(dataIds, graph=self.datasetType.dimensions), # type: ignore 

261 timespan, 

262 ) 

263 query.columns.append(sqlalchemy.sql.func.count()) 

264 sql = query.combine() 

265 # Acquire a table lock to ensure there are no concurrent writes that

266 # could invalidate our checking before we finish the inserts. We 

267 # use a SAVEPOINT in case there is an outer transaction that a 

268 # failure here should not roll back. 

269 with self._db.transaction(lock=[self._calibs], savepoint=True): 

270 # Run the check SELECT query. 

271 with self._db.query(sql) as sql_result: 

272 conflicting = sql_result.scalar() 

273 if conflicting > 0: 

274 raise ConflictingDefinitionError( 

275 f"{conflicting} validity range conflicts certifying datasets of type " 

276 f"{self.datasetType.name} into {collection.name} for range " 

277 f"[{timespan.begin}, {timespan.end})." 

278 ) 

279 # Proceed with the insert. 

280 self._db.insert(self._calibs, *rows) 

281 
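When the timespan representation has no database-enforced exclusion constraint, `certify` emulates one: inside a locked transaction it counts rows whose validity ranges overlap the new one and inserts only if that count is zero. A minimal sketch of the same check-then-insert pattern in plain SQLAlchemy Core, with hypothetical `conn`, `table`, and `overlap_clause` objects (the real method additionally builds the overlap clause with `_buildCalibOverlapQuery` and uses the butler `Database` transaction/lock helpers):

    import sqlalchemy

    def insert_if_no_overlap(conn, table, overlap_clause, rows):
        # Count rows that would conflict with the new validity ranges.
        conflicting = conn.execute(
            sqlalchemy.select(sqlalchemy.func.count()).select_from(table).where(overlap_clause)
        ).scalar()
        if conflicting:
            raise RuntimeError(f"{conflicting} validity range conflicts detected.")
        conn.execute(sqlalchemy.insert(table), rows)
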

282 def decertify( 

283 self, 

284 collection: CollectionRecord, 

285 timespan: Timespan, 

286 *, 

287 dataIds: Iterable[DataCoordinate] | None = None, 

288 ) -> None: 

289 # Docstring inherited from DatasetRecordStorage. 

290 if self._calibs is None:    290 ↛ 291 (line 290 didn't jump to line 291, because the condition on line 290 was never true)

291 raise CollectionTypeError( 

292 f"Cannot decertify datasets of type {self.datasetType.name}, for which " 

293 f"DatasetType.isCalibration() is False." 

294 ) 

295 if collection.type is not CollectionType.CALIBRATION:    295 ↛ 296 (line 295 didn't jump to line 296, because the condition on line 295 was never true)

296 raise CollectionTypeError( 

297 f"Cannot decertify from collection '{collection.name}' " 

298 f"of type {collection.type.name}; must be CALIBRATION." 

299 ) 

300 TimespanReprClass = self._db.getTimespanRepresentation() 

301 # Construct a SELECT query to find all rows that overlap our inputs. 

302 dataIdSet: DataCoordinateSet | None 

303 if dataIds is not None: 

304 dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions) 

305 else: 

306 dataIdSet = None 

307 query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan) 

308 query.columns.extend(self._calibs.columns) 

309 sql = query.combine() 

310 # Set up collections to populate with the rows we'll want to modify. 

311 # The insert rows will have the same values for collection and 

312 # dataset type. 

313 protoInsertRow = { 

314 self._collections.getCollectionForeignKeyName(): collection.key, 

315 "dataset_type_id": self._dataset_type_id, 

316 } 

317 rowsToDelete = [] 

318 rowsToInsert = [] 

319 # Acquire a table lock to ensure there are no concurrent writes 

320 # between the SELECT and the DELETE and INSERT queries based on it. 

321 with self._db.transaction(lock=[self._calibs], savepoint=True): 

322 with self._db.query(sql) as sql_result: 

323 sql_rows = sql_result.mappings().fetchall() 

324 for row in sql_rows: 

325 rowsToDelete.append({"id": row["id"]}) 

326 # Construct the insert row(s) by copying the prototype row, 

327 # then adding the dimension column values, then adding what's 

328 # left of the timespan from that row after we subtract the 

329 # given timespan. 

330 newInsertRow = protoInsertRow.copy() 

331 newInsertRow["dataset_id"] = row["dataset_id"] 

332 for name in self.datasetType.dimensions.required.names: 

333 newInsertRow[name] = row[name] 

334 rowTimespan = TimespanReprClass.extract(row) 

335 assert rowTimespan is not None, "Field should have a NOT NULL constraint." 

336 for diffTimespan in rowTimespan.difference(timespan): 

337 rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy())) 

338 # Run the DELETE and INSERT queries. 

339 self._db.delete(self._calibs, ["id"], *rowsToDelete) 

340 self._db.insert(self._calibs, *rowsToInsert) 

341 
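Decertification keeps whatever parts of each stored validity range fall outside the decertified interval, which is what the `rowTimespan.difference(timespan)` loop above computes. A small illustration of that splitting, assuming the public `Timespan` API and astropy times:

    from astropy.time import Time
    from lsst.daf.butler import Timespan

    full = Timespan(Time("2023-01-01", scale="tai"), Time("2024-01-01", scale="tai"))
    removed = Timespan(Time("2023-06-01", scale="tai"), Time("2023-07-01", scale="tai"))
    # difference() yields the pieces of `full` not covered by `removed`:
    # roughly January-May and July-December, each becoming its own calib row.
    remaining = list(full.difference(removed))
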

342 def select( 

343 self, 

344 *collections: CollectionRecord, 

345 dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select, 

346 id: SimpleQuery.Select.Or[int | None] = SimpleQuery.Select, 

347 run: SimpleQuery.Select.Or[None] = SimpleQuery.Select, 

348 timespan: SimpleQuery.Select.Or[Timespan | None] = SimpleQuery.Select, 

349 ingestDate: SimpleQuery.Select.Or[Timespan | None] = None, 

350 rank: SimpleQuery.Select.Or[None] = None, 

351 ) -> sqlalchemy.sql.Selectable: 

352 # Docstring inherited from DatasetRecordStorage. 

353 collection_types = {collection.type for collection in collections} 

354 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened." 

355 TimespanReprClass = self._db.getTimespanRepresentation() 

356 # 

357 # There are two kinds of table in play here: 

358 # 

359 # - the static dataset table (with the dataset ID, dataset type ID, 

360 # run ID/name, and ingest date); 

361 # 

362 # - the dynamic tags/calibs table (with the dataset ID, dataset type 

363 # type ID, collection ID/name, data ID, and possibly validity 

364 # range). 

365 # 

366 # That means that we might want to return a query against either table 

367 # or a JOIN of both, depending on which quantities the caller wants. 

368 # But this method is documented/typed such that ``dataId`` is never 

369 # `None` - i.e. we always constrain or retrieve the data ID. That

370 # means we'll always include the tags/calibs table and join in the 

371 # static dataset table only if we need things from it that we can't get 

372 # from the tags/calibs table. 

373 # 

374 # Note that it's important that we include a WHERE constraint on both 

375 # tables for any column (e.g. dataset_type_id) that is in both when 

376 # it's given explicitly; not doing so can prevent the query planner from

377 # using very important indexes. At present, we don't include those 

378 # redundant columns in the JOIN ON expression, however, because the 

379 # FOREIGN KEY (and its index) are defined only on dataset_id. 

380 # 

381 # We'll start by accumulating kwargs to pass to SimpleQuery.join when 

382 # we bring in the tags/calibs table. We get the data ID or constrain 

383 # it in the tags/calibs table(s), but that's multiple columns, not one, 

384 # so we need to transform the one Select.Or argument into a dictionary 

385 # of them. 

386 kwargs: dict[str, Any] 

387 if dataId is SimpleQuery.Select: 

388 kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required} 

389 else: 

390 kwargs = dict(dataId.byName()) 

391 # We always constrain (never retrieve) the dataset type in at least the 

392 # tags/calibs table. 

393 kwargs["dataset_type_id"] = self._dataset_type_id 

394 # Join in the tags and/or calibs tables, turning those 'kwargs' entries 

395 # into WHERE constraints or SELECT columns as appropriate. 

396 if collection_types != {CollectionType.CALIBRATION}: 

397 # We'll need a subquery for the tags table if any of the given 

398 # collections are not a CALIBRATION collection. This intentionally 

399 # also fires when the list of collections is empty as a way to 

400 # create a dummy subquery that we know will fail. 

401 tags_query = SimpleQuery() 

402 tags_query.join(self._tags, **kwargs) 

403 # If the timespan is requested, simulate a potentially compound 

404 # column whose values are the maximum and minimum timespan 

405 # bounds. 

406 # If the timespan is constrained, ignore the constraint, since 

407 # it'd be guaranteed to evaluate to True. 

408 if timespan is SimpleQuery.Select: 

409 tags_query.columns.extend(TimespanReprClass.fromLiteral(Timespan(None, None)).flatten()) 

410 self._finish_single_select( 

411 tags_query, 

412 self._tags, 

413 collections, 

414 id=id, 

415 run=run, 

416 ingestDate=ingestDate, 

417 rank=rank, 

418 ) 

419 else: 

420 tags_query = None 

421 if CollectionType.CALIBRATION in collection_types: 

422 # If at least one collection is a CALIBRATION collection, we'll 

423 # need a subquery for the calibs table, and could include the 

424 # timespan as a result or constraint. 

425 calibs_query = SimpleQuery() 

426 assert ( 

427 self._calibs is not None 

428 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection." 

429 calibs_query.join(self._calibs, **kwargs) 

430 # Add the timespan column(s) to the result columns, or constrain 

431 # the timespan via an overlap condition. 

432 if timespan is SimpleQuery.Select: 

433 calibs_query.columns.extend(TimespanReprClass.from_columns(self._calibs.columns).flatten()) 

434 elif timespan is not None: 

435 calibs_query.where.append( 

436 TimespanReprClass.from_columns(self._calibs.columns).overlaps( 

437 TimespanReprClass.fromLiteral(timespan) 

438 ) 

439 ) 

440 self._finish_single_select( 

441 calibs_query, 

442 self._calibs, 

443 collections, 

444 id=id, 

445 run=run, 

446 ingestDate=ingestDate, 

447 rank=rank, 

448 ) 

449 else: 

450 calibs_query = None 

451 if calibs_query is not None: 

452 if tags_query is not None: 

453 return tags_query.combine().union(calibs_query.combine()) 

454 else: 

455 return calibs_query.combine() 

456 else: 

457 assert tags_query is not None, "Earlier logic should guarantee that at least one is not None."

458 return tags_query.combine() 

459 

460 def _finish_single_select( 

461 self, 

462 query: SimpleQuery, 

463 table: sqlalchemy.schema.Table, 

464 collections: Sequence[CollectionRecord], 

465 id: SimpleQuery.Select.Or[int | None], 

466 run: SimpleQuery.Select.Or[None], 

467 ingestDate: SimpleQuery.Select.Or[Timespan | None], 

468 rank: SimpleQuery.Select.Or[None], 

469 ) -> None: 

470 dataset_id_col = table.columns.dataset_id 

471 collection_col = table.columns[self._collections.getCollectionForeignKeyName()] 

472 # We always constrain (never retrieve) the collection(s) in the 

473 # tags/calibs table. 

474 if len(collections) == 1: 

475 query.where.append(collection_col == collections[0].key) 

476 elif len(collections) == 0: 

477 # We support the case where there are no collections as a way to 

478 # generate a valid SQL query that can't yield results. This should 

479 # never get executed, but lots of downstream code will still try 

480 # to access the SQLAlchemy objects representing the columns in the 

481 # subquery. That's not ideal, but it'd take a lot of refactoring 

482 # to fix it (DM-31725). 

483 query.where.append(sqlalchemy.sql.literal(False)) 

484 else: 

485 query.where.append(collection_col.in_([collection.key for collection in collections])) 

486 # Add rank, if requested, as a CASE-based calculation on the collection

487 # column. 

488 if rank is not None: 

489 assert rank is SimpleQuery.Select, "Cannot constrain rank, only select it."

490 query.columns.append( 

491 sqlalchemy.sql.case( 

492 {record.key: n for n, record in enumerate(collections)}, 

493 value=collection_col, 

494 ).label("rank") 

495 ) 

496 # We can always get the dataset_id from the tags/calibs table or 

497 # constrain it there. Can't use kwargs for that because we need to 

498 # alias it to 'id'. 

499 if id is SimpleQuery.Select: 

500 query.columns.append(dataset_id_col.label("id")) 

501 elif id is not None:    501 ↛ 502 (line 501 didn't jump to line 502, because the condition on line 501 was never true)

502 query.where.append(dataset_id_col == id) 

503 # It's possible we now have everything we need, from just the 

504 # tags/calibs table. The things we might need to get from the static 

505 # dataset table are the run key and the ingest date. 

506 need_static_table = False 

507 static_kwargs: dict[str, Any] = {} 

508 if run is not None: 

509 assert run is SimpleQuery.Select, "To constrain the run name, pass a RunRecord as a collection." 

510 if len(collections) == 1 and collections[0].type is CollectionType.RUN: 

511 # If we are searching exactly one RUN collection, we 

512 # know that if we find the dataset in that collection, 

513 # then that's the dataset's run; we don't need to

514 # query for it. 

515 query.columns.append(sqlalchemy.sql.literal(collections[0].key).label(self._runKeyColumn)) 

516 else: 

517 static_kwargs[self._runKeyColumn] = SimpleQuery.Select 

518 need_static_table = True 

519 # Ingest date can only come from the static table. 

520 if ingestDate is not None: 

521 need_static_table = True 

522 if ingestDate is SimpleQuery.Select:    522 ↛ 525 (line 522 didn't jump to line 525, because the condition on line 522 was never false)

523 static_kwargs["ingest_date"] = SimpleQuery.Select 

524 else: 

525 assert isinstance(ingestDate, Timespan) 

526 # Timespan is astropy Time (usually in TAI) and ingest_date is 

527 # TIMESTAMP, convert values to Python datetime for sqlalchemy. 

528 if ingestDate.isEmpty(): 

529 raise RuntimeError("Empty timespan constraint provided for ingest_date.") 

530 if ingestDate.begin is not None: 

531 begin = ingestDate.begin.utc.datetime # type: ignore 

532 query.where.append(self._static.dataset.columns.ingest_date >= begin) 

533 if ingestDate.end is not None: 

534 end = ingestDate.end.utc.datetime # type: ignore 

535 query.where.append(self._static.dataset.columns.ingest_date < end) 

536 # If we need the static table, join it in via dataset_id and 

537 # dataset_type_id.

538 if need_static_table: 

539 query.join( 

540 self._static.dataset, 

541 onclause=(dataset_id_col == self._static.dataset.columns.id), 

542 **static_kwargs, 

543 ) 

544 # Also constrain dataset_type_id in static table in case that helps 

545 # generate a better plan. 

546 # We could also include this in the JOIN ON clause, but my guess is 

547 # that that's a good idea IFF it's in the foreign key, and right 

548 # now it isn't. 

549 query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id) 

550 
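The `rank` column above is a CASE expression mapping each collection key to its position in the search path, so results can be ordered by collection-search order. A self-contained sketch of the same construct with a hypothetical table and hypothetical collection keys:

    import sqlalchemy

    metadata = sqlalchemy.MetaData()
    tags = sqlalchemy.Table(
        "tags_example",
        metadata,
        sqlalchemy.Column("dataset_id", sqlalchemy.Integer),
        sqlalchemy.Column("collection_id", sqlalchemy.Integer),
    )
    search_path = [7, 3, 12]  # hypothetical collection primary keys, in search order
    rank = sqlalchemy.case(
        {key: n for n, key in enumerate(search_path)},
        value=tags.columns.collection_id,
    ).label("rank")
    query = sqlalchemy.select(tags.columns.dataset_id, rank).order_by("rank")
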

551 def getDataId(self, id: DatasetId) -> DataCoordinate: 

552 """Return DataId for a dataset. 

553 

554 Parameters 

555 ---------- 

556 id : `DatasetId` 

557 Unique dataset identifier. 

558 

559 Returns 

560 ------- 

561 dataId : `DataCoordinate` 

562 DataId for the dataset. 

563 """ 

564 # This query could return multiple rows (one for each tagged collection 

565 # the dataset is in, plus one for its run collection), and we don't 

566 # care which of those we get. 

567 sql = ( 

568 self._tags.select() 

569 .where( 

570 sqlalchemy.sql.and_( 

571 self._tags.columns.dataset_id == id, 

572 self._tags.columns.dataset_type_id == self._dataset_type_id, 

573 ) 

574 ) 

575 .limit(1) 

576 ) 

577 with self._db.query(sql) as sql_result: 

578 row = sql_result.mappings().fetchone() 

579 assert row is not None, "Should be guaranteed by caller and foreign key constraints." 

580 return DataCoordinate.standardize( 

581 {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required}, 

582 graph=self.datasetType.dimensions, 

583 ) 

584 

585 

586@deprecated( 

587 "Integer dataset IDs are deprecated in favor of UUIDs; support will be removed after v25. " 

588 "Please migrate or re-create this data repository.", 

589 version="v25.0", 

590 category=FutureWarning, 

591) 

592class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage): 

593 """Implementation of ByDimensionsDatasetRecordStorage which uses integer 

594 auto-incremented column for dataset IDs. 

595 """ 

596 

597 def insert( 

598 self, 

599 run: RunRecord, 

600 dataIds: Iterable[DataCoordinate], 

601 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

602 ) -> Iterator[DatasetRef]: 

603 # Docstring inherited from DatasetRecordStorage. 

604 

605 # We only support UNIQUE mode for integer dataset IDs 

606 if idMode != DatasetIdGenEnum.UNIQUE: 

607 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.") 

608 

609 # Transform a possibly-single-pass iterable into a list. 

610 dataIdList = list(dataIds) 

611 yield from self._insert(run, dataIdList) 

612 

613 def import_( 

614 self, 

615 run: RunRecord, 

616 datasets: Iterable[DatasetRef], 

617 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

618 reuseIds: bool = False, 

619 ) -> Iterator[DatasetRef]: 

620 # Docstring inherited from DatasetRecordStorage. 

621 

622 # We only support UNIQUE mode for integer dataset IDs 

623 if idGenerationMode != DatasetIdGenEnum.UNIQUE: 

624 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.") 

625 

626 # Make a list of dataIds and optionally dataset IDs. 

627 dataIdList: list[DataCoordinate] = [] 

628 datasetIdList: list[int] = [] 

629 for dataset in datasets: 

630 dataIdList.append(dataset.dataId) 

631 

632 # We only accept integer dataset IDs, but also allow None. 

633 datasetId = dataset.id 

634 if datasetId is None: 

635 # if reuseIds is set then all IDs must be known 

636 if reuseIds: 

637 raise TypeError("All dataset IDs must be known if `reuseIds` is set") 

638 elif isinstance(datasetId, int): 

639 if reuseIds: 

640 datasetIdList.append(datasetId) 

641 else: 

642 raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}") 

643 

644 yield from self._insert(run, dataIdList, datasetIdList) 

645 

646 def _insert( 

647 self, run: RunRecord, dataIdList: list[DataCoordinate], datasetIdList: list[int] | None = None 

648 ) -> Iterator[DatasetRef]: 

649 """Common part of implementation of `insert` and `import_` methods.""" 

650 

651 # Remember any governor dimension values we see. 

652 summary = CollectionSummary() 

653 summary.add_data_ids(self.datasetType, dataIdList) 

654 

655 staticRow = { 

656 "dataset_type_id": self._dataset_type_id, 

657 self._runKeyColumn: run.key, 

658 } 

659 with self._db.transaction(): 

660 # Insert into the static dataset table, generating autoincrement 

661 # dataset_id values. 

662 if datasetIdList: 

663 # reuse existing IDs 

664 rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList] 

665 self._db.insert(self._static.dataset, *rows) 

666 else: 

667 # use auto-incremented IDs 

668 datasetIdList = self._db.insert( 

669 self._static.dataset, *([staticRow] * len(dataIdList)), returnIds=True 

670 ) 

671 assert datasetIdList is not None 

672 # Update the summary tables for this collection in case this is the 

673 # first time this dataset type or these governor values will be 

674 # inserted there. 

675 self._summaries.update(run, [self._dataset_type_id], summary) 

676 # Combine the generated dataset_id values and data ID fields to 

677 # form rows to be inserted into the tags table. 

678 protoTagsRow = { 

679 "dataset_type_id": self._dataset_type_id, 

680 self._collections.getCollectionForeignKeyName(): run.key, 

681 } 

682 tagsRows = [ 

683 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

684 for dataId, dataset_id in zip(dataIdList, datasetIdList) 

685 ] 

686 # Insert those rows into the tags table. This is where we'll 

687 # get any unique constraint violations. 

688 self._db.insert(self._tags, *tagsRows) 

689 

690 for dataId, datasetId in zip(dataIdList, datasetIdList): 

691 yield DatasetRef( 

692 datasetType=self.datasetType, 

693 dataId=dataId, 

694 id=datasetId, 

695 run=run.name, 

696 ) 

697 

698 

699class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage): 

700 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for 

701 dataset IDs. 

702 """ 

703 

704 idMaker = DatasetIdFactory() 

705 """Factory for dataset IDs. In the future this factory may be shared with 

706 other classes (e.g. Registry).""" 

707 

708 def insert( 

709 self, 

710 run: RunRecord, 

711 dataIds: Iterable[DataCoordinate], 

712 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

713 ) -> Iterator[DatasetRef]: 

714 # Docstring inherited from DatasetRecordStorage. 

715 

716 # Iterate over data IDs, transforming a possibly-single-pass iterable 

717 # into a list. 

718 dataIdList = [] 

719 rows = [] 

720 summary = CollectionSummary() 

721 for dataId in summary.add_data_ids_generator(self.datasetType, dataIds): 

722 dataIdList.append(dataId) 

723 rows.append( 

724 { 

725 "id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode), 

726 "dataset_type_id": self._dataset_type_id, 

727 self._runKeyColumn: run.key, 

728 } 

729 ) 

730 

731 with self._db.transaction(): 

732 # Insert into the static dataset table. 

733 self._db.insert(self._static.dataset, *rows) 

734 # Update the summary tables for this collection in case this is the 

735 # first time this dataset type or these governor values will be 

736 # inserted there. 

737 self._summaries.update(run, [self._dataset_type_id], summary) 

738 # Combine the generated dataset_id values and data ID fields to 

739 # form rows to be inserted into the tags table. 

740 protoTagsRow = { 

741 "dataset_type_id": self._dataset_type_id, 

742 self._collections.getCollectionForeignKeyName(): run.key, 

743 } 

744 tagsRows = [ 

745 dict(protoTagsRow, dataset_id=row["id"], **dataId.byName()) 

746 for dataId, row in zip(dataIdList, rows) 

747 ] 

748 # Insert those rows into the tags table. 

749 self._db.insert(self._tags, *tagsRows) 

750 

751 for dataId, row in zip(dataIdList, rows): 

752 yield DatasetRef( 

753 datasetType=self.datasetType, 

754 dataId=dataId, 

755 id=row["id"], 

756 run=run.name, 

757 ) 

758 
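The `idMaker.makeDatasetId` call above can produce either a completely random ID (`UNIQUE` mode) or, in the deterministic generation modes, an ID derived from inputs such as the run, dataset type, and data ID, so that repeating the same import reproduces the same UUID. A rough illustration of the two styles using only the standard `uuid` module (the namespace and key string are placeholders, not the factory's actual scheme):

    import uuid

    PLACEHOLDER_NAMESPACE = uuid.UUID("00000000-0000-0000-0000-000000000000")
    key = "run/2023|flat|instrument=LATISS,detector=0"  # hypothetical run/type/data ID key

    random_id = uuid.uuid4()  # analogous to UNIQUE mode: a fresh ID every call
    deterministic_id = uuid.uuid5(PLACEHOLDER_NAMESPACE, key)  # analogous to deterministic modes
    assert deterministic_id == uuid.uuid5(PLACEHOLDER_NAMESPACE, key)
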

759 def import_( 

760 self, 

761 run: RunRecord, 

762 datasets: Iterable[DatasetRef], 

763 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

764 reuseIds: bool = False, 

765 ) -> Iterator[DatasetRef]: 

766 # Docstring inherited from DatasetRecordStorage. 

767 

768 # Iterate over the incoming datasets, transforming a possibly-single-pass

769 # iterable into a mapping from dataset ID to data ID.

770 dataIds = {} 

771 summary = CollectionSummary() 

772 for dataset in summary.add_datasets_generator(datasets): 

773 # Ignore unknown ID types; normally all IDs have the same type, but

774 # this code supports mixed types or missing IDs. 

775 datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None 

776 if datasetId is None: 

777 datasetId = self.idMaker.makeDatasetId( 

778 run.name, self.datasetType, dataset.dataId, idGenerationMode 

779 ) 

780 dataIds[datasetId] = dataset.dataId 

781 

782 # We'll insert all new rows into a temporary table 

783 tableSpec = makeTagTableSpec(self.datasetType, type(self._collections), ddl.GUID, constraints=False) 

784 collFkName = self._collections.getCollectionForeignKeyName() 

785 protoTagsRow = { 

786 "dataset_type_id": self._dataset_type_id, 

787 collFkName: run.key, 

788 } 

789 tmpRows = [ 

790 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

791 for dataset_id, dataId in dataIds.items() 

792 ] 

793 with self._db.transaction(for_temp_tables=True): 

794 with self._db.temporary_table(tableSpec) as tmp_tags: 

795 # store all incoming data in a temporary table 

796 self._db.insert(tmp_tags, *tmpRows) 

797 

798 # There are some checks that we want to make for consistency 

799 # of the new datasets with existing ones. 

800 self._validateImport(tmp_tags, run) 

801 

802 # Before we merge temporary table into dataset/tags we need to 

803 # drop datasets which are already there (and do not conflict). 

804 self._db.deleteWhere( 

805 tmp_tags, 

806 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)), 

807 ) 

808 

809 # Copy it into dataset table, need to re-label some columns. 

810 self._db.insert( 

811 self._static.dataset, 

812 select=sqlalchemy.sql.select( 

813 tmp_tags.columns.dataset_id.label("id"), 

814 tmp_tags.columns.dataset_type_id, 

815 tmp_tags.columns[collFkName].label(self._runKeyColumn), 

816 ), 

817 ) 

818 

819 # Update the summary tables for this collection in case this 

820 # is the first time this dataset type or these governor values 

821 # will be inserted there. 

822 self._summaries.update(run, [self._dataset_type_id], summary) 

823 

824 # Copy it into tags table. 

825 self._db.insert(self._tags, select=tmp_tags.select()) 

826 

827 # Return refs in the same order as in the input list. 

828 for dataset_id, dataId in dataIds.items(): 

829 yield DatasetRef( 

830 datasetType=self.datasetType, 

831 id=dataset_id, 

832 dataId=dataId, 

833 run=run.name, 

834 ) 

835 
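`import_` stages incoming rows in a temporary table, validates them against existing records, deletes the IDs that are already present, and then copies what remains into the static dataset table with an INSERT ... FROM SELECT. A minimal sketch of that final copy step in plain SQLAlchemy Core, with hypothetical table definitions standing in for the real static and temporary tables:

    import sqlalchemy

    metadata = sqlalchemy.MetaData()
    dataset = sqlalchemy.Table(
        "dataset",
        metadata,
        sqlalchemy.Column("id", sqlalchemy.String(36), primary_key=True),
        sqlalchemy.Column("dataset_type_id", sqlalchemy.Integer),
        sqlalchemy.Column("run_id", sqlalchemy.Integer),
    )
    tmp_tags = sqlalchemy.Table(
        "tmp_tags",
        metadata,
        sqlalchemy.Column("dataset_id", sqlalchemy.String(36)),
        sqlalchemy.Column("dataset_type_id", sqlalchemy.Integer),
        sqlalchemy.Column("collection_id", sqlalchemy.Integer),
    )
    # Re-label the temporary-table columns to match the static table's schema,
    # then copy everything in one statement.
    select_new = sqlalchemy.select(
        tmp_tags.columns.dataset_id.label("id"),
        tmp_tags.columns.dataset_type_id,
        tmp_tags.columns.collection_id.label("run_id"),
    )
    copy_stmt = dataset.insert().from_select(["id", "dataset_type_id", "run_id"], select_new)
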

836 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None: 

837 """Validate imported refs against existing datasets. 

838 

839 Parameters 

840 ---------- 

841 tmp_tags : `sqlalchemy.schema.Table` 

842 Temporary table with new datasets and the same schema as tags 

843 table. 

844 run : `RunRecord` 

845 The record object describing the `~CollectionType.RUN` collection. 

846 

847 Raises 

848 ------ 

849 ConflictingDefinitionError 

850 Raise if new datasets conflict with existing ones. 

851 """ 

852 dataset = self._static.dataset 

853 tags = self._tags 

854 collFkName = self._collections.getCollectionForeignKeyName() 

855 

856 # Check that existing datasets have the same dataset type and 

857 # run. 

858 query = ( 

859 sqlalchemy.sql.select( 

860 dataset.columns.id.label("dataset_id"), 

861 dataset.columns.dataset_type_id.label("dataset_type_id"), 

862 tmp_tags.columns.dataset_type_id.label("new dataset_type_id"), 

863 dataset.columns[self._runKeyColumn].label("run"), 

864 tmp_tags.columns[collFkName].label("new run"), 

865 ) 

866 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id)) 

867 .where( 

868 sqlalchemy.sql.or_( 

869 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

870 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName], 

871 ) 

872 ) 

873 .limit(1) 

874 ) 

875 with self._db.query(query) as result: 

876 if (row := result.first()) is not None: 

877 # Only include the first one in the exception message 

878 raise ConflictingDefinitionError( 

879 f"Existing dataset type or run do not match new dataset: {row._asdict()}" 

880 ) 

881 

882 # Check that matching dataset in tags table has the same DataId. 

883 query = ( 

884 sqlalchemy.sql.select( 

885 tags.columns.dataset_id, 

886 tags.columns.dataset_type_id.label("type_id"), 

887 tmp_tags.columns.dataset_type_id.label("new type_id"), 

888 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

889 *[ 

890 tmp_tags.columns[dim].label(f"new {dim}") 

891 for dim in self.datasetType.dimensions.required.names 

892 ], 

893 ) 

894 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id)) 

895 .where( 

896 sqlalchemy.sql.or_( 

897 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

898 *[ 

899 tags.columns[dim] != tmp_tags.columns[dim] 

900 for dim in self.datasetType.dimensions.required.names 

901 ], 

902 ) 

903 ) 

904 .limit(1) 

905 ) 

906 

907 with self._db.query(query) as result: 

908 if (row := result.first()) is not None: 

909 # Only include the first one in the exception message 

910 raise ConflictingDefinitionError( 

911 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}" 

912 ) 

913 

914 # Check that matching run+dataId have the same dataset ID. 

915 query = ( 

916 sqlalchemy.sql.select( 

917 tags.columns.dataset_type_id.label("dataset_type_id"), 

918 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

919 tags.columns.dataset_id, 

920 tmp_tags.columns.dataset_id.label("new dataset_id"), 

921 tags.columns[collFkName], 

922 tmp_tags.columns[collFkName].label(f"new {collFkName}"), 

923 ) 

924 .select_from( 

925 tags.join( 

926 tmp_tags, 

927 sqlalchemy.sql.and_( 

928 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id, 

929 tags.columns[collFkName] == tmp_tags.columns[collFkName], 

930 *[ 

931 tags.columns[dim] == tmp_tags.columns[dim] 

932 for dim in self.datasetType.dimensions.required.names 

933 ], 

934 ), 

935 ) 

936 ) 

937 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id) 

938 .limit(1) 

939 ) 

940 with self._db.query(query) as result: 

941 if (row := result.first()) is not None: 

942 # Only include the first one in the exception message

943 raise ConflictingDefinitionError( 

944 f"Existing dataset type and dataId does not match new dataset: {row._asdict()}" 

945 )