Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 79%

303 statements  

coverage.py v6.5.0, created at 2022-12-15 02:03 -0800

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23from __future__ import annotations 

24 

25__all__ = ("ByDimensionsDatasetRecordStorage",) 

26 

27import uuid 

28from collections.abc import Iterable, Iterator, Sequence 

29from typing import TYPE_CHECKING, Any 

30 

31import sqlalchemy 

32from deprecated.sphinx import deprecated 

33 

34from ....core import ( 

35 DataCoordinate, 

36 DataCoordinateSet, 

37 DatasetId, 

38 DatasetRef, 

39 DatasetType, 

40 SimpleQuery, 

41 Timespan, 

42 ddl, 

43) 

44from ..._collection_summary import CollectionSummary 

45from ..._collectionType import CollectionType 

46from ..._exceptions import CollectionTypeError, ConflictingDefinitionError, UnsupportedIdGeneratorError 

47from ...interfaces import DatasetIdFactory, DatasetIdGenEnum, DatasetRecordStorage 

48from .tables import makeTagTableSpec 

49 

50if TYPE_CHECKING:

51 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord 

52 from .summaries import CollectionSummaryManager 

53 from .tables import StaticDatasetTablesTuple 

54 

55 

56class ByDimensionsDatasetRecordStorage(DatasetRecordStorage): 

57 """Dataset record storage implementation paired with 

58 `ByDimensionsDatasetRecordStorageManager`; see that class for more 

59 information. 

60 

61 Instances of this class should never be constructed directly; use 

62 `DatasetRecordStorageManager.register` instead. 

63 """ 

64 

65 def __init__( 

66 self, 

67 *, 

68 datasetType: DatasetType, 

69 db: Database, 

70 dataset_type_id: int, 

71 collections: CollectionManager, 

72 static: StaticDatasetTablesTuple, 

73 summaries: CollectionSummaryManager, 

74 tags: sqlalchemy.schema.Table, 

75 calibs: sqlalchemy.schema.Table | None, 

76 ): 

77 super().__init__(datasetType=datasetType) 

78 self._dataset_type_id = dataset_type_id 

79 self._db = db 

80 self._collections = collections 

81 self._static = static 

82 self._summaries = summaries 

83 self._tags = tags 

84 self._calibs = calibs 

85 self._runKeyColumn = collections.getRunForeignKeyName() 

86 

87 def find( 

88 self, collection: CollectionRecord, dataId: DataCoordinate, timespan: Timespan | None = None 

89 ) -> DatasetRef | None: 

90 # Docstring inherited from DatasetRecordStorage. 

91 assert dataId.graph == self.datasetType.dimensions 

92 if collection.type is CollectionType.CALIBRATION and timespan is None:

93 raise TypeError( 

94 f"Cannot search for dataset in CALIBRATION collection {collection.name} " 

95 f"without an input timespan." 

96 ) 

97 sql = self.select( 

98 collection, dataId=dataId, id=SimpleQuery.Select, run=SimpleQuery.Select, timespan=timespan 

99 ) 

100 results = self._db.query(sql) 

101 row = results.fetchone() 

102 if row is None: 

103 return None 

104 if collection.type is CollectionType.CALIBRATION: 

105 # For temporal calibration lookups (only!) our invariants do not 

106 # guarantee that the number of result rows is <= 1. 

107 # They would if `select` constrained the given timespan to be 

108 # _contained_ by the validity range in the self._calibs table, 

109 # instead of simply _overlapping_ it, because we do guarantee that 

110 # the validity ranges are disjoint for a particular dataset type, 

111 # collection, and data ID. But using an overlap test and a check 

112 # for multiple result rows here allows us to provide a more useful 

113 # diagnostic, as well as allowing `select` to support more general 

114 # queries where multiple results are not an error. 

115 if results.fetchone() is not None: 

116 raise RuntimeError( 

117 f"Multiple matches found for calibration lookup in {collection.name} for " 

118 f"{self.datasetType.name} with {dataId} overlapping {timespan}. " 

119 ) 

120 return DatasetRef( 

121 datasetType=self.datasetType, 

122 dataId=dataId, 

123 id=row.id, 

124 run=self._collections[row._mapping[self._runKeyColumn]].name, 

125 ) 

126 
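    # Illustrative usage sketch (not part of the module under coverage). The names
    # ``storage``, ``run_record``, ``calib_record``, ``data_id``, ``t0`` and ``t1``
    # are hypothetical stand-ins for objects obtained from a live registry; only the
    # ``find`` signature above is taken from this file.
    #
    #     ref = storage.find(run_record, data_id)
    #     if ref is None:
    #         ...  # dataset not present in that collection
    #
    #     # CALIBRATION collections require a timespan for the lookup:
    #     ref = storage.find(calib_record, data_id, timespan=Timespan(t0, t1))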

127 def delete(self, datasets: Iterable[DatasetRef]) -> None: 

128 # Docstring inherited from DatasetRecordStorage. 

129 # Only delete from common dataset table; ON DELETE foreign key clauses 

130 # will handle the rest. 

131 self._db.delete( 

132 self._static.dataset, 

133 ["id"], 

134 *[{"id": dataset.getCheckedId()} for dataset in datasets], 

135 ) 

136 

137 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

138 # Docstring inherited from DatasetRecordStorage. 

139 if collection.type is not CollectionType.TAGGED:

140 raise TypeError( 

141 f"Cannot associate into collection '{collection.name}' " 

142 f"of type {collection.type.name}; must be TAGGED." 

143 ) 

144 protoRow = { 

145 self._collections.getCollectionForeignKeyName(): collection.key, 

146 "dataset_type_id": self._dataset_type_id, 

147 } 

148 rows = [] 

149 summary = CollectionSummary() 

150 for dataset in summary.add_datasets_generator(datasets): 

151 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

152 for dimension, value in dataset.dataId.items(): 

153 row[dimension.name] = value 

154 rows.append(row) 

155 # Update the summary tables for this collection in case this is the 

156 # first time this dataset type or these governor values will be 

157 # inserted there. 

158 self._summaries.update(collection, [self._dataset_type_id], summary) 

159 # Update the tag table itself. 

160 self._db.replace(self._tags, *rows) 

161 

162 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

163 # Docstring inherited from DatasetRecordStorage. 

164 if collection.type is not CollectionType.TAGGED:

165 raise TypeError( 

166 f"Cannot disassociate from collection '{collection.name}' " 

167 f"of type {collection.type.name}; must be TAGGED." 

168 ) 

169 rows = [ 

170 { 

171 "dataset_id": dataset.getCheckedId(), 

172 self._collections.getCollectionForeignKeyName(): collection.key, 

173 } 

174 for dataset in datasets 

175 ] 

176 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows) 

177 
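    # A minimal sketch of tag membership management (hypothetical names;
    # ``tagged_record`` is assumed to be a CollectionRecord of type TAGGED and
    # ``refs`` an iterable of resolved DatasetRef objects):
    #
    #     storage.associate(tagged_record, refs)     # add the datasets to the tag
    #     storage.disassociate(tagged_record, refs)  # and remove them again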

178 def _buildCalibOverlapQuery( 

179 self, collection: CollectionRecord, dataIds: DataCoordinateSet | None, timespan: Timespan 

180 ) -> SimpleQuery: 

181 assert self._calibs is not None 

182 # Start by building a SELECT query for any rows that would overlap 

183 # this one. 

184 query = SimpleQuery() 

185 query.join(self._calibs) 

186 # Add a WHERE clause matching the dataset type and collection. 

187 query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id) 

188 query.where.append( 

189 self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key 

190 ) 

191 # Add a WHERE clause matching any of the given data IDs. 

192 if dataIds is not None: 

193 dataIds.constrain( 

194 query, 

195 lambda name: self._calibs.columns[name], # type: ignore 

196 ) 

197 # Add WHERE clause for timespan overlaps. 

198 TimespanReprClass = self._db.getTimespanRepresentation() 

199 query.where.append( 

200 TimespanReprClass.from_columns(self._calibs.columns).overlaps( 

201 TimespanReprClass.fromLiteral(timespan) 

202 ) 

203 ) 

204 return query 

205 

206 def certify( 

207 self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan 

208 ) -> None: 

209 # Docstring inherited from DatasetRecordStorage. 

210 if self._calibs is None:

211 raise CollectionTypeError( 

212 f"Cannot certify datasets of type {self.datasetType.name}, for which " 

213 f"DatasetType.isCalibration() is False." 

214 ) 

215 if collection.type is not CollectionType.CALIBRATION:

216 raise CollectionTypeError( 

217 f"Cannot certify into collection '{collection.name}' " 

218 f"of type {collection.type.name}; must be CALIBRATION." 

219 ) 

220 TimespanReprClass = self._db.getTimespanRepresentation() 

221 protoRow = { 

222 self._collections.getCollectionForeignKeyName(): collection.key, 

223 "dataset_type_id": self._dataset_type_id, 

224 } 

225 rows = [] 

226 dataIds: set[DataCoordinate] | None = ( 

227 set() if not TimespanReprClass.hasExclusionConstraint() else None 

228 ) 

229 summary = CollectionSummary() 

230 for dataset in summary.add_datasets_generator(datasets): 

231 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

232 for dimension, value in dataset.dataId.items(): 

233 row[dimension.name] = value 

234 TimespanReprClass.update(timespan, result=row) 

235 rows.append(row) 

236 if dataIds is not None:

237 dataIds.add(dataset.dataId) 

238 # Update the summary tables for this collection in case this is the 

239 # first time this dataset type or these governor values will be 

240 # inserted there. 

241 self._summaries.update(collection, [self._dataset_type_id], summary) 

242 # Update the association table itself. 

243 if TimespanReprClass.hasExclusionConstraint():

244 # Rely on database constraint to enforce invariants; we just 

245 # reraise the exception for consistency across DB engines. 

246 try: 

247 self._db.insert(self._calibs, *rows) 

248 except sqlalchemy.exc.IntegrityError as err: 

249 raise ConflictingDefinitionError( 

250 f"Validity range conflict certifying datasets of type {self.datasetType.name} " 

251 f"into {collection.name} for range [{timespan.begin}, {timespan.end})." 

252 ) from err 

253 else: 

254 # Have to implement exclusion constraint ourselves. 

255 # Start by building a SELECT query for any rows that would overlap 

256 # this one. 

257 query = self._buildCalibOverlapQuery( 

258 collection, 

259 DataCoordinateSet(dataIds, graph=self.datasetType.dimensions), # type: ignore 

260 timespan, 

261 ) 

262 query.columns.append(sqlalchemy.sql.func.count()) 

263 sql = query.combine() 

264 # Acquire a table lock to ensure there are no concurrent writes that

265 # could invalidate our checking before we finish the inserts. We

266 # use a SAVEPOINT in case there is an outer transaction that a 

267 # failure here should not roll back. 

268 with self._db.transaction(lock=[self._calibs], savepoint=True): 

269 # Run the check SELECT query. 

270 conflicting = self._db.query(sql).scalar() 

271 if conflicting > 0: 

272 raise ConflictingDefinitionError( 

273 f"{conflicting} validity range conflicts certifying datasets of type " 

274 f"{self.datasetType.name} into {collection.name} for range " 

275 f"[{timespan.begin}, {timespan.end})." 

276 ) 

277 # Proceed with the insert. 

278 self._db.insert(self._calibs, *rows) 

279 
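    # Hedged example of certifying calibrations (hypothetical names; assumes the
    # dataset type has isCalibration() == True and ``calib_record`` is a
    # CALIBRATION collection):
    #
    #     storage.certify(calib_record, refs, Timespan(t0, t1))
    #
    # Certifying the same data IDs again with an overlapping validity range
    # raises ConflictingDefinitionError, as implemented above.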

280 def decertify( 

281 self, 

282 collection: CollectionRecord, 

283 timespan: Timespan, 

284 *, 

285 dataIds: Iterable[DataCoordinate] | None = None, 

286 ) -> None: 

287 # Docstring inherited from DatasetRecordStorage. 

288 if self._calibs is None:

289 raise CollectionTypeError( 

290 f"Cannot decertify datasets of type {self.datasetType.name}, for which " 

291 f"DatasetType.isCalibration() is False." 

292 ) 

293 if collection.type is not CollectionType.CALIBRATION:

294 raise CollectionTypeError( 

295 f"Cannot decertify from collection '{collection.name}' " 

296 f"of type {collection.type.name}; must be CALIBRATION." 

297 ) 

298 TimespanReprClass = self._db.getTimespanRepresentation() 

299 # Construct a SELECT query to find all rows that overlap our inputs. 

300 dataIdSet: DataCoordinateSet | None 

301 if dataIds is not None: 

302 dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions) 

303 else: 

304 dataIdSet = None 

305 query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan) 

306 query.columns.extend(self._calibs.columns) 

307 sql = query.combine() 

308 # Set up collections to populate with the rows we'll want to modify. 

309 # The insert rows will have the same values for collection and 

310 # dataset type. 

311 protoInsertRow = { 

312 self._collections.getCollectionForeignKeyName(): collection.key, 

313 "dataset_type_id": self._dataset_type_id, 

314 } 

315 rowsToDelete = [] 

316 rowsToInsert = [] 

317 # Acquire a table lock to ensure there are no concurrent writes 

318 # between the SELECT and the DELETE and INSERT queries based on it. 

319 with self._db.transaction(lock=[self._calibs], savepoint=True): 

320 for row in self._db.query(sql).mappings(): 

321 rowsToDelete.append({"id": row["id"]}) 

322 # Construct the insert row(s) by copying the prototype row, 

323 # then adding the dimension column values, then adding what's 

324 # left of the timespan from that row after we subtract the 

325 # given timespan. 

326 newInsertRow = protoInsertRow.copy() 

327 newInsertRow["dataset_id"] = row["dataset_id"] 

328 for name in self.datasetType.dimensions.required.names: 

329 newInsertRow[name] = row[name] 

330 rowTimespan = TimespanReprClass.extract(row) 

331 assert rowTimespan is not None, "Field should have a NOT NULL constraint." 

332 for diffTimespan in rowTimespan.difference(timespan): 

333 rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy())) 

334 # Run the DELETE and INSERT queries. 

335 self._db.delete(self._calibs, ["id"], *rowsToDelete) 

336 self._db.insert(self._calibs, *rowsToInsert) 

337 
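    # Sketch of the splitting behaviour implemented above (hypothetical names;
    # assumes a dataset was certified over [t0, t3) and t0 < t1 < t2 < t3):
    #
    #     storage.decertify(calib_record, Timespan(t1, t2))
    #
    # The overlapping row for [t0, t3) is deleted, and because
    # ``Timespan.difference`` yields what is left after subtracting [t1, t2),
    # two replacement rows covering [t0, t1) and [t2, t3) are inserted.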

338 def select( 

339 self, 

340 *collections: CollectionRecord, 

341 dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select, 

342 id: SimpleQuery.Select.Or[int | None] = SimpleQuery.Select, 

343 run: SimpleQuery.Select.Or[None] = SimpleQuery.Select, 

344 timespan: SimpleQuery.Select.Or[Timespan | None] = SimpleQuery.Select, 

345 ingestDate: SimpleQuery.Select.Or[Timespan | None] = None, 

346 rank: SimpleQuery.Select.Or[None] = None, 

347 ) -> sqlalchemy.sql.Selectable: 

348 # Docstring inherited from DatasetRecordStorage. 

349 collection_types = {collection.type for collection in collections} 

350 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened." 

351 TimespanReprClass = self._db.getTimespanRepresentation() 

352 # 

353 # There are two kinds of table in play here: 

354 # 

355 # - the static dataset table (with the dataset ID, dataset type ID, 

356 # run ID/name, and ingest date); 

357 # 

358 # - the dynamic tags/calibs table (with the dataset ID, dataset

359 # type ID, collection ID/name, data ID, and possibly validity 

360 # range). 

361 # 

362 # That means that we might want to return a query against either table 

363 # or a JOIN of both, depending on which quantities the caller wants. 

364 # But this method is documented/typed such that ``dataId`` is never 

365 # `None` - i.e. we always constrain or retrieve the data ID. That

366 # means we'll always include the tags/calibs table and join in the 

367 # static dataset table only if we need things from it that we can't get 

368 # from the tags/calibs table. 

369 # 

370 # Note that it's important that we include a WHERE constraint on both 

371 # tables for any column (e.g. dataset_type_id) that is in both when 

372 # it's given explicitly; not doing so can prevent the query planner from

373 # using very important indexes. At present, we don't include those 

374 # redundant columns in the JOIN ON expression, however, because the 

375 # FOREIGN KEY (and its index) are defined only on dataset_id. 

376 # 

377 # We'll start by accumulating kwargs to pass to SimpleQuery.join when 

378 # we bring in the tags/calibs table. We get the data ID or constrain 

379 # it in the tags/calibs table(s), but that's multiple columns, not one, 

380 # so we need to transform the one Select.Or argument into a dictionary 

381 # of them. 

382 kwargs: dict[str, Any] 

383 if dataId is SimpleQuery.Select: 

384 kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required} 

385 else: 

386 kwargs = dict(dataId.byName()) 

387 # We always constrain (never retrieve) the dataset type in at least the 

388 # tags/calibs table. 

389 kwargs["dataset_type_id"] = self._dataset_type_id 

390 # Join in the tags and/or calibs tables, turning those 'kwargs' entries 

391 # into WHERE constraints or SELECT columns as appropriate. 

392 if collection_types != {CollectionType.CALIBRATION}: 

393 # We'll need a subquery for the tags table if any of the given 

394 # collections are not a CALIBRATION collection. This intentionally 

395 # also fires when the list of collections is empty as a way to 

396 # create a dummy subquery that we know will return no rows.

397 tags_query = SimpleQuery() 

398 tags_query.join(self._tags, **kwargs) 

399 # If the timespan is requested, simulate a potentially compound 

400 # column whose values are the maximum and minimum timespan 

401 # bounds. 

402 # If the timespan is constrained, ignore the constraint, since 

403 # it'd be guaranteed to evaluate to True. 

404 if timespan is SimpleQuery.Select: 

405 tags_query.columns.extend(TimespanReprClass.fromLiteral(Timespan(None, None)).flatten()) 

406 self._finish_single_select( 

407 tags_query, 

408 self._tags, 

409 collections, 

410 id=id, 

411 run=run, 

412 ingestDate=ingestDate, 

413 rank=rank, 

414 ) 

415 else: 

416 tags_query = None 

417 if CollectionType.CALIBRATION in collection_types: 

418 # If at least one collection is a CALIBRATION collection, we'll 

419 # need a subquery for the calibs table, and could include the 

420 # timespan as a result or constraint. 

421 calibs_query = SimpleQuery() 

422 assert ( 

423 self._calibs is not None 

424 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection." 

425 calibs_query.join(self._calibs, **kwargs) 

426 # Add the timespan column(s) to the result columns, or constrain 

427 # the timespan via an overlap condition. 

428 if timespan is SimpleQuery.Select: 

429 calibs_query.columns.extend(TimespanReprClass.from_columns(self._calibs.columns).flatten()) 

430 elif timespan is not None: 

431 calibs_query.where.append( 

432 TimespanReprClass.from_columns(self._calibs.columns).overlaps( 

433 TimespanReprClass.fromLiteral(timespan) 

434 ) 

435 ) 

436 self._finish_single_select( 

437 calibs_query, 

438 self._calibs, 

439 collections, 

440 id=id, 

441 run=run, 

442 ingestDate=ingestDate, 

443 rank=rank, 

444 ) 

445 else: 

446 calibs_query = None 

447 if calibs_query is not None: 

448 if tags_query is not None: 

449 return tags_query.combine().union(calibs_query.combine()) 

450 else: 

451 return calibs_query.combine() 

452 else: 

453 assert tags_query is not None, "Earlier logic should guarantee at least one is not None."

454 return tags_query.combine() 

455 
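    # Illustrative call mirroring what ``find`` does earlier in this class
    # (hypothetical ``storage``, ``collection_record``, ``data_id`` and ``db``):
    #
    #     sql = storage.select(
    #         collection_record,
    #         dataId=data_id,
    #         id=SimpleQuery.Select,
    #         run=SimpleQuery.Select,
    #         timespan=None,
    #     )
    #     row = db.query(sql).fetchone()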

456 def _finish_single_select( 

457 self, 

458 query: SimpleQuery, 

459 table: sqlalchemy.schema.Table, 

460 collections: Sequence[CollectionRecord], 

461 id: SimpleQuery.Select.Or[int | None], 

462 run: SimpleQuery.Select.Or[None], 

463 ingestDate: SimpleQuery.Select.Or[Timespan | None], 

464 rank: SimpleQuery.Select.Or[None], 

465 ) -> None: 

466 dataset_id_col = table.columns.dataset_id 

467 collection_col = table.columns[self._collections.getCollectionForeignKeyName()] 

468 # We always constrain (never retrieve) the collection(s) in the 

469 # tags/calibs table. 

470 if len(collections) == 1: 

471 query.where.append(collection_col == collections[0].key) 

472 elif len(collections) == 0: 

473 # We support the case where there are no collections as a way to 

474 # generate a valid SQL query that can't yield results. This should 

475 # never get executed, but lots of downstream code will still try 

476 # to access the SQLAlchemy objects representing the columns in the 

477 # subquery. That's not ideal, but it'd take a lot of refactoring 

478 # to fix it (DM-31725). 

479 query.where.append(sqlalchemy.sql.literal(False)) 

480 else: 

481 query.where.append(collection_col.in_([collection.key for collection in collections])) 

482 # Add rank, if requested, as a CASE-based calculation on the collection

483 # column. 

484 if rank is not None: 

485 assert rank is SimpleQuery.Select, "Cannot constrain rank, only select it."

486 query.columns.append( 

487 sqlalchemy.sql.case( 

488 {record.key: n for n, record in enumerate(collections)}, 

489 value=collection_col, 

490 ).label("rank") 

491 ) 

492 # We can always get the dataset_id from the tags/calibs table or 

493 # constrain it there. Can't use kwargs for that because we need to 

494 # alias it to 'id'. 

495 if id is SimpleQuery.Select: 

496 query.columns.append(dataset_id_col.label("id")) 

497 elif id is not None:

498 query.where.append(dataset_id_col == id) 

499 # It's possible we now have everything we need, from just the 

500 # tags/calibs table. The things we might need to get from the static 

501 # dataset table are the run key and the ingest date. 

502 need_static_table = False 

503 static_kwargs: dict[str, Any] = {} 

504 if run is not None: 

505 assert run is SimpleQuery.Select, "To constrain the run name, pass a RunRecord as a collection." 

506 if len(collections) == 1 and collections[0].type is CollectionType.RUN: 

507 # If we are searching exactly one RUN collection, we 

508 # know that if we find the dataset in that collection, 

509 # then that's the dataset's run; we don't need to

510 # query for it. 

511 query.columns.append(sqlalchemy.sql.literal(collections[0].key).label(self._runKeyColumn)) 

512 else: 

513 static_kwargs[self._runKeyColumn] = SimpleQuery.Select 

514 need_static_table = True 

515 # Ingest date can only come from the static table. 

516 if ingestDate is not None: 

517 need_static_table = True 

518 if ingestDate is SimpleQuery.Select:

519 static_kwargs["ingest_date"] = SimpleQuery.Select 

520 else: 

521 assert isinstance(ingestDate, Timespan) 

522 # Timespan is astropy Time (usually in TAI) and ingest_date is 

523 # TIMESTAMP; convert values to Python datetime for sqlalchemy.

524 if ingestDate.isEmpty(): 

525 raise RuntimeError("Empty timespan constraint provided for ingest_date.") 

526 if ingestDate.begin is not None: 

527 begin = ingestDate.begin.utc.datetime # type: ignore 

528 query.where.append(self._static.dataset.columns.ingest_date >= begin) 

529 if ingestDate.end is not None: 

530 end = ingestDate.end.utc.datetime # type: ignore 

531 query.where.append(self._static.dataset.columns.ingest_date < end) 

532 # If we need the static table, join it in via dataset_id and 

533 # dataset_type_id 

534 if need_static_table: 

535 query.join( 

536 self._static.dataset, 

537 onclause=(dataset_id_col == self._static.dataset.columns.id), 

538 **static_kwargs, 

539 ) 

540 # Also constrain dataset_type_id in static table in case that helps 

541 # generate a better plan. 

542 # We could also include this in the JOIN ON clause, but my guess is 

543 # that that's a good idea IFF it's in the foreign key, and right 

544 # now it isn't. 

545 query.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id) 

546 

547 def getDataId(self, id: DatasetId) -> DataCoordinate: 

548 """Return DataId for a dataset. 

549 

550 Parameters 

551 ---------- 

552 id : `DatasetId` 

553 Unique dataset identifier. 

554 

555 Returns 

556 ------- 

557 dataId : `DataCoordinate` 

558 DataId for the dataset. 

559 """ 

560 # This query could return multiple rows (one for each tagged collection 

561 # the dataset is in, plus one for its run collection), and we don't 

562 # care which of those we get. 

563 sql = ( 

564 self._tags.select() 

565 .where( 

566 sqlalchemy.sql.and_( 

567 self._tags.columns.dataset_id == id, 

568 self._tags.columns.dataset_type_id == self._dataset_type_id, 

569 ) 

570 ) 

571 .limit(1) 

572 ) 

573 row = self._db.query(sql).mappings().fetchone() 

574 assert row is not None, "Should be guaranteed by caller and foreign key constraints." 

575 return DataCoordinate.standardize( 

576 {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required}, 

577 graph=self.datasetType.dimensions, 

578 ) 

579 
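    # Usage sketch (hypothetical ``storage`` and ``dataset_id``; the caller must
    # already know that the ID refers to a dataset of this dataset type):
    #
    #     data_id = storage.getDataId(dataset_id)
    #     assert data_id.graph == storage.datasetType.dimensions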

580 

581@deprecated( 

582 "Integer dataset IDs are deprecated in favor of UUIDs; support will be removed after v26. " 

583 "Please migrate or re-create this data repository.", 

584 version="v25.0", 

585 category=FutureWarning, 

586) 

587class ByDimensionsDatasetRecordStorageInt(ByDimensionsDatasetRecordStorage): 

588 """Implementation of ByDimensionsDatasetRecordStorage which uses an

589 auto-incremented integer column for dataset IDs.

590 """ 

591 

592 def insert( 

593 self, 

594 run: RunRecord, 

595 dataIds: Iterable[DataCoordinate], 

596 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

597 ) -> Iterator[DatasetRef]: 

598 # Docstring inherited from DatasetRecordStorage. 

599 

600 # We only support UNIQUE mode for integer dataset IDs 

601 if idMode != DatasetIdGenEnum.UNIQUE: 

602 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.") 

603 

604 # Transform a possibly-single-pass iterable into a list. 

605 dataIdList = list(dataIds) 

606 yield from self._insert(run, dataIdList) 

607 

608 def import_( 

609 self, 

610 run: RunRecord, 

611 datasets: Iterable[DatasetRef], 

612 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

613 reuseIds: bool = False, 

614 ) -> Iterator[DatasetRef]: 

615 # Docstring inherited from DatasetRecordStorage. 

616 

617 # We only support UNIQUE mode for integer dataset IDs 

618 if idGenerationMode != DatasetIdGenEnum.UNIQUE: 

619 raise UnsupportedIdGeneratorError("Only UNIQUE mode can be used with integer dataset IDs.") 

620 

621 # Make a list of dataIds and optionally dataset IDs. 

622 dataIdList: list[DataCoordinate] = [] 

623 datasetIdList: list[int] = [] 

624 for dataset in datasets: 

625 dataIdList.append(dataset.dataId) 

626 

627 # We only accept integer dataset IDs, but also allow None. 

628 datasetId = dataset.id 

629 if datasetId is None: 

630 # if reuseIds is set then all IDs must be known 

631 if reuseIds: 

632 raise TypeError("All dataset IDs must be known if `reuseIds` is set") 

633 elif isinstance(datasetId, int): 

634 if reuseIds: 

635 datasetIdList.append(datasetId) 

636 else: 

637 raise TypeError(f"Unsupported type of dataset ID: {type(datasetId)}") 

638 

639 yield from self._insert(run, dataIdList, datasetIdList) 

640 

641 def _insert( 

642 self, run: RunRecord, dataIdList: list[DataCoordinate], datasetIdList: list[int] | None = None 

643 ) -> Iterator[DatasetRef]: 

644 """Common part of implementation of `insert` and `import_` methods.""" 

645 

646 # Remember any governor dimension values we see. 

647 summary = CollectionSummary() 

648 summary.add_data_ids(self.datasetType, dataIdList) 

649 

650 staticRow = { 

651 "dataset_type_id": self._dataset_type_id, 

652 self._runKeyColumn: run.key, 

653 } 

654 with self._db.transaction(): 

655 # Insert into the static dataset table, generating autoincrement 

656 # dataset_id values. 

657 if datasetIdList: 

658 # reuse existing IDs 

659 rows = [dict(staticRow, id=datasetId) for datasetId in datasetIdList] 

660 self._db.insert(self._static.dataset, *rows) 

661 else: 

662 # use auto-incremented IDs 

663 datasetIdList = self._db.insert( 

664 self._static.dataset, *([staticRow] * len(dataIdList)), returnIds=True 

665 ) 

666 assert datasetIdList is not None 

667 # Update the summary tables for this collection in case this is the 

668 # first time this dataset type or these governor values will be 

669 # inserted there. 

670 self._summaries.update(run, [self._dataset_type_id], summary) 

671 # Combine the generated dataset_id values and data ID fields to 

672 # form rows to be inserted into the tags table. 

673 protoTagsRow = { 

674 "dataset_type_id": self._dataset_type_id, 

675 self._collections.getCollectionForeignKeyName(): run.key, 

676 } 

677 tagsRows = [ 

678 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

679 for dataId, dataset_id in zip(dataIdList, datasetIdList) 

680 ] 

681 # Insert those rows into the tags table. This is where we'll 

682 # get any unique constraint violations. 

683 self._db.insert(self._tags, *tagsRows) 

684 

685 for dataId, datasetId in zip(dataIdList, datasetIdList): 

686 yield DatasetRef( 

687 datasetType=self.datasetType, 

688 dataId=dataId, 

689 id=datasetId, 

690 run=run.name, 

691 ) 

692 

693 

694class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage): 

695 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for 

696 dataset IDs. 

697 """ 

698 

699 idMaker = DatasetIdFactory() 

700 """Factory for dataset IDs. In the future this factory may be shared with 

701 other classes (e.g. Registry).""" 

702 

703 def insert( 

704 self, 

705 run: RunRecord, 

706 dataIds: Iterable[DataCoordinate], 

707 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

708 ) -> Iterator[DatasetRef]: 

709 # Docstring inherited from DatasetRecordStorage. 

710 

711 # Iterate over data IDs, transforming a possibly-single-pass iterable 

712 # into a list. 

713 dataIdList = [] 

714 rows = [] 

715 summary = CollectionSummary() 

716 for dataId in summary.add_data_ids_generator(self.datasetType, dataIds): 

717 dataIdList.append(dataId) 

718 rows.append( 

719 { 

720 "id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode), 

721 "dataset_type_id": self._dataset_type_id, 

722 self._runKeyColumn: run.key, 

723 } 

724 ) 

725 

726 with self._db.transaction(): 

727 # Insert into the static dataset table. 

728 self._db.insert(self._static.dataset, *rows) 

729 # Update the summary tables for this collection in case this is the 

730 # first time this dataset type or these governor values will be 

731 # inserted there. 

732 self._summaries.update(run, [self._dataset_type_id], summary) 

733 # Combine the generated dataset_id values and data ID fields to 

734 # form rows to be inserted into the tags table. 

735 protoTagsRow = { 

736 "dataset_type_id": self._dataset_type_id, 

737 self._collections.getCollectionForeignKeyName(): run.key, 

738 } 

739 tagsRows = [ 

740 dict(protoTagsRow, dataset_id=row["id"], **dataId.byName()) 

741 for dataId, row in zip(dataIdList, rows) 

742 ] 

743 # Insert those rows into the tags table. 

744 self._db.insert(self._tags, *tagsRows) 

745 

746 for dataId, row in zip(dataIdList, rows): 

747 yield DatasetRef( 

748 datasetType=self.datasetType, 

749 dataId=dataId, 

750 id=row["id"], 

751 run=run.name, 

752 ) 

753 
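    # Illustrative sketch of UUID-based insertion (hypothetical ``storage``,
    # ``run_record`` and ``data_ids``):
    #
    #     refs = list(storage.insert(run_record, data_ids))
    #
    # Each returned DatasetRef carries a UUID generated by ``idMaker``; whether
    # it is random or deterministic depends on the ``idMode`` argument
    # (a DatasetIdGenEnum value).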

754 def import_( 

755 self, 

756 run: RunRecord, 

757 datasets: Iterable[DatasetRef], 

758 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

759 reuseIds: bool = False, 

760 ) -> Iterator[DatasetRef]: 

761 # Docstring inherited from DatasetRecordStorage. 

762 

763 # Iterate over datasets, transforming a possibly-single-pass iterable

764 # into a mapping from dataset ID to data ID.

765 dataIds = {} 

766 summary = CollectionSummary() 

767 for dataset in summary.add_datasets_generator(datasets): 

768 # Ignore unknown ID types; normally all IDs have the same type, but

769 # this code supports mixed types or missing IDs. 

770 datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None 

771 if datasetId is None: 

772 datasetId = self.idMaker.makeDatasetId( 

773 run.name, self.datasetType, dataset.dataId, idGenerationMode 

774 ) 

775 dataIds[datasetId] = dataset.dataId 

776 

777 with self._db.session() as session: 

778 

779 # insert all new rows into a temporary table 

780 tableSpec = makeTagTableSpec( 

781 self.datasetType, type(self._collections), ddl.GUID, constraints=False 

782 ) 

783 tmp_tags = session.makeTemporaryTable(tableSpec) 

784 

785 collFkName = self._collections.getCollectionForeignKeyName() 

786 protoTagsRow = { 

787 "dataset_type_id": self._dataset_type_id, 

788 collFkName: run.key, 

789 } 

790 tmpRows = [ 

791 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

792 for dataset_id, dataId in dataIds.items() 

793 ] 

794 

795 with self._db.transaction(): 

796 

797 # store all incoming data in a temporary table 

798 self._db.insert(tmp_tags, *tmpRows) 

799 

800 # There are some checks that we want to make for consistency 

801 # of the new datasets with existing ones. 

802 self._validateImport(tmp_tags, run) 

803 

804 # Before we merge temporary table into dataset/tags we need to 

805 # drop datasets which are already there (and do not conflict). 

806 self._db.deleteWhere( 

807 tmp_tags, 

808 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)), 

809 ) 

810 

811 # Copy it into the dataset table; we need to re-label some columns.

812 self._db.insert( 

813 self._static.dataset, 

814 select=sqlalchemy.sql.select( 

815 tmp_tags.columns.dataset_id.label("id"), 

816 tmp_tags.columns.dataset_type_id, 

817 tmp_tags.columns[collFkName].label(self._runKeyColumn), 

818 ), 

819 ) 

820 

821 # Update the summary tables for this collection in case this 

822 # is the first time this dataset type or these governor values 

823 # will be inserted there. 

824 self._summaries.update(run, [self._dataset_type_id], summary) 

825 

826 # Copy it into tags table. 

827 self._db.insert(self._tags, select=tmp_tags.select()) 

828 

829 # Return refs in the same order as in the input list. 

830 for dataset_id, dataId in dataIds.items(): 

831 yield DatasetRef( 

832 datasetType=self.datasetType, 

833 id=dataset_id, 

834 dataId=dataId, 

835 run=run.name, 

836 ) 

837 
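    # Sketch of importing pre-resolved refs (hypothetical ``storage``,
    # ``run_record`` and ``existing_refs``). Because rows are staged in a
    # temporary table, validated against the dataset/tags tables, and filtered
    # to drop dataset IDs that already exist, importing the same refs a second
    # time is effectively a no-op rather than an error:
    #
    #     refs = list(storage.import_(run_record, existing_refs))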

838 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None: 

839 """Validate imported refs against existing datasets. 

840 

841 Parameters 

842 ---------- 

843 tmp_tags : `sqlalchemy.schema.Table` 

844 Temporary table with new datasets and the same schema as tags 

845 table. 

846 run : `RunRecord` 

847 The record object describing the `~CollectionType.RUN` collection. 

848 

849 Raises 

850 ------ 

851 ConflictingDefinitionError 

852 Raise if new datasets conflict with existing ones. 

853 """ 

854 dataset = self._static.dataset 

855 tags = self._tags 

856 collFkName = self._collections.getCollectionForeignKeyName() 

857 

858 # Check that existing datasets have the same dataset type and 

859 # run. 

860 query = ( 

861 sqlalchemy.sql.select( 

862 dataset.columns.id.label("dataset_id"), 

863 dataset.columns.dataset_type_id.label("dataset_type_id"), 

864 tmp_tags.columns.dataset_type_id.label("new dataset_type_id"), 

865 dataset.columns[self._runKeyColumn].label("run"), 

866 tmp_tags.columns[collFkName].label("new run"), 

867 ) 

868 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id)) 

869 .where( 

870 sqlalchemy.sql.or_( 

871 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

872 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName], 

873 ) 

874 ) 

875 ) 

876 result = self._db.query(query) 

877 if (row := result.first()) is not None: 

878 # Only include the first one in the exception message 

879 raise ConflictingDefinitionError( 

880 f"Existing dataset type or run does not match the new dataset: {row._asdict()}"

881 ) 

882 

883 # Check that matching dataset in tags table has the same DataId. 

884 query = ( 

885 sqlalchemy.sql.select( 

886 tags.columns.dataset_id, 

887 tags.columns.dataset_type_id.label("type_id"), 

888 tmp_tags.columns.dataset_type_id.label("new type_id"), 

889 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

890 *[ 

891 tmp_tags.columns[dim].label(f"new {dim}") 

892 for dim in self.datasetType.dimensions.required.names 

893 ], 

894 ) 

895 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id)) 

896 .where( 

897 sqlalchemy.sql.or_( 

898 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

899 *[ 

900 tags.columns[dim] != tmp_tags.columns[dim] 

901 for dim in self.datasetType.dimensions.required.names 

902 ], 

903 ) 

904 ) 

905 ) 

906 result = self._db.query(query) 

907 if (row := result.first()) is not None: 

908 # Only include the first one in the exception message 

909 raise ConflictingDefinitionError( 

910 f"Existing dataset type or dataId does not match the new dataset: {row._asdict()}"

911 ) 

912 

913 # Check that matching run+dataId have the same dataset ID. 

914 query = ( 

915 sqlalchemy.sql.select( 

916 tags.columns.dataset_type_id.label("dataset_type_id"), 

917 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

918 tags.columns.dataset_id, 

919 tmp_tags.columns.dataset_id.label("new dataset_id"), 

920 tags.columns[collFkName], 

921 tmp_tags.columns[collFkName].label(f"new {collFkName}"), 

922 ) 

923 .select_from( 

924 tags.join( 

925 tmp_tags, 

926 sqlalchemy.sql.and_( 

927 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id, 

928 tags.columns[collFkName] == tmp_tags.columns[collFkName], 

929 *[ 

930 tags.columns[dim] == tmp_tags.columns[dim] 

931 for dim in self.datasetType.dimensions.required.names 

932 ], 

933 ), 

934 ) 

935 ) 

936 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id) 

937 ) 

938 result = self._db.query(query) 

939 if (row := result.first()) is not None: 

940 # only include the first one in the exception message 

941 raise ConflictingDefinitionError( 

942 f"Existing dataset type and dataId does not match new dataset: {row._asdict()}" 

943 )