Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 95%

261 statements  

coverage.py v7.4.4, created at 2024-04-05 09:58 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28 

29from __future__ import annotations 

30 

31from .... import ddl 

32 

33__all__ = ("ByDimensionsDatasetRecordStorage",) 

34 

35import datetime 

36from collections.abc import Callable, Iterable, Iterator, Sequence, Set 

37from typing import TYPE_CHECKING 

38 

39import astropy.time 

40import sqlalchemy 

41from lsst.daf.relation import Relation, sql 

42 

43from ...._column_tags import DatasetColumnTag, DimensionKeyColumnTag 

44from ...._column_type_info import LogicalColumn 

45from ...._dataset_ref import DatasetId, DatasetIdFactory, DatasetIdGenEnum, DatasetRef 

46from ...._dataset_type import DatasetType 

47from ...._exceptions import CollectionTypeError 

48from ...._timespan import Timespan 

49from ....dimensions import DataCoordinate 

50from ..._collection_summary import CollectionSummary 

51from ..._collection_type import CollectionType 

52from ..._exceptions import ConflictingDefinitionError 

53from ...interfaces import DatasetRecordStorage 

54from ...queries import SqlQueryContext 

55from .tables import makeTagTableSpec 

56 

57if TYPE_CHECKING: 

58 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord 

59 from .summaries import CollectionSummaryManager 

60 from .tables import StaticDatasetTablesTuple 

61 

62 

63class ByDimensionsDatasetRecordStorage(DatasetRecordStorage): 

64 """Dataset record storage implementation paired with 

65 `ByDimensionsDatasetRecordStorageManagerUUID`; see that class for more 

66 information. 

67 

68 Instances of this class should never be constructed directly; use 

69 `DatasetRecordStorageManager.register` instead. 

70 

71 Parameters 

72 ---------- 

73 datasetType : `DatasetType` 

74 The dataset type to use. 

75 db : `Database` 

76 Database connection. 

77 dataset_type_id : `int` 

78 Dataset type identifier. 

79 collections : `CollectionManager` 

80 The collection manager. 

81 static : `StaticDatasetTablesTuple` 

82 Named tuple containing the static dataset tables. 

83 summaries : `CollectionSummaryManager` 

84 Collection summary manager. 

85 tags_table_factory : `~collections.abc.Callable` 

86 Factory for creating tags tables. 

87 use_astropy_ingest_date : `bool` 

88 Whether to use Astropy for ingest date. 

89 calibs_table_factory : `~collections.abc.Callable` 

90 Factory for creating calibration tables. 

91 """ 

92 

93 def __init__( 

94 self, 

95 *, 

96 datasetType: DatasetType, 

97 db: Database, 

98 dataset_type_id: int, 

99 collections: CollectionManager, 

100 static: StaticDatasetTablesTuple, 

101 summaries: CollectionSummaryManager, 

102 tags_table_factory: Callable[[], sqlalchemy.schema.Table], 

103 use_astropy_ingest_date: bool, 

104 calibs_table_factory: Callable[[], sqlalchemy.schema.Table] | None, 

105 ): 

106 super().__init__(datasetType=datasetType) 

107 self._dataset_type_id = dataset_type_id 

108 self._db = db 

109 self._collections = collections 

110 self._static = static 

111 self._summaries = summaries 

112 self._tags_table_factory = tags_table_factory 

113 self._calibs_table_factory = calibs_table_factory 

114 self._runKeyColumn = collections.getRunForeignKeyName() 

115 self._use_astropy = use_astropy_ingest_date 

116 self._tags_table: sqlalchemy.schema.Table | None = None 

117 self._calibs_table: sqlalchemy.schema.Table | None = None 

118 

119 @property 

120 def _tags(self) -> sqlalchemy.schema.Table: 

121 if self._tags_table is None: 

122 self._tags_table = self._tags_table_factory() 

123 return self._tags_table 

124 

125 @property 

126 def _calibs(self) -> sqlalchemy.schema.Table | None: 

127 if self._calibs_table is None: 

128 if self._calibs_table_factory is None:  [branch 128 ↛ 129 not taken: the condition on line 128 was never true]

129 return None 

130 self._calibs_table = self._calibs_table_factory() 

131 return self._calibs_table 

132 
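The `_tags` and `_calibs` properties above create their tables lazily, on first access, from the factories passed to the constructor; `_calibs` may be absent entirely for non-calibration dataset types. A self-contained sketch of the same lazy-factory pattern, with hypothetical names not taken from this module:

    from collections.abc import Callable

    class LazyTable:
        """Illustrative only: cache the result of a table factory on first use."""

        def __init__(self, factory: Callable[[], object] | None):
            self._factory = factory  # may be None (no calibs table for this type)
            self._table: object | None = None

        @property
        def table(self) -> object | None:
            # Build the table once, on demand; return None if there is no factory.
            if self._table is None:
                if self._factory is None:
                    return None
                self._table = self._factory()
            return self._table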

133 def delete(self, datasets: Iterable[DatasetRef]) -> None: 

134 # Docstring inherited from DatasetRecordStorage. 

135 # Only delete from common dataset table; ON DELETE foreign key clauses 

136 # will handle the rest. 

137 self._db.delete( 

138 self._static.dataset, 

139 ["id"], 

140 *[{"id": dataset.id} for dataset in datasets], 

141 ) 

142 

143 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

144 # Docstring inherited from DatasetRecordStorage. 

145 if collection.type is not CollectionType.TAGGED:  [branch 145 ↛ 146 not taken: the condition on line 145 was never true]

146 raise TypeError( 

147 f"Cannot associate into collection '{collection.name}' " 

148 f"of type {collection.type.name}; must be TAGGED." 

149 ) 

150 protoRow = { 

151 self._collections.getCollectionForeignKeyName(): collection.key, 

152 "dataset_type_id": self._dataset_type_id, 

153 } 

154 rows = [] 

155 summary = CollectionSummary() 

156 for dataset in summary.add_datasets_generator(datasets): 

157 rows.append(dict(protoRow, dataset_id=dataset.id, **dataset.dataId.required)) 

158 # Update the summary tables for this collection in case this is the 

159 # first time this dataset type or these governor values will be 

160 # inserted there. 

161 self._summaries.update(collection, [self._dataset_type_id], summary) 

162 # Update the tag table itself. 

163 self._db.replace(self._tags, *rows) 

164 
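The rows inserted above are built by merging a per-collection prototype with per-dataset fields via `dict(protoRow, ...)`. A tiny illustration of that merge; the keys and values here are made up, not the real column names:

    proto_row = {"collection_id": 42, "dataset_type_id": 7}
    required_data_id = {"instrument": "HSC", "detector": 10}  # made-up data ID values
    row = dict(proto_row, dataset_id="uuid-1234", **required_data_id)
    # row == {"collection_id": 42, "dataset_type_id": 7, "dataset_id": "uuid-1234",
    #         "instrument": "HSC", "detector": 10}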

165 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

166 # Docstring inherited from DatasetRecordStorage. 

167 if collection.type is not CollectionType.TAGGED:  [branch 167 ↛ 168 not taken: the condition on line 167 was never true]

168 raise TypeError( 

169 f"Cannot disassociate from collection '{collection.name}' " 

170 f"of type {collection.type.name}; must be TAGGED." 

171 ) 

172 rows = [ 

173 { 

174 "dataset_id": dataset.id, 

175 self._collections.getCollectionForeignKeyName(): collection.key, 

176 } 

177 for dataset in datasets 

178 ] 

179 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows) 

180 

181 def _buildCalibOverlapQuery( 

182 self, 

183 collection: CollectionRecord, 

184 data_ids: set[DataCoordinate] | None, 

185 timespan: Timespan, 

186 context: SqlQueryContext, 

187 ) -> Relation: 

188 relation = self.make_relation( 

189 collection, columns={"timespan", "dataset_id", "calib_pkey"}, context=context 

190 ).with_rows_satisfying( 

191 context.make_timespan_overlap_predicate( 

192 DatasetColumnTag(self.datasetType.name, "timespan"), timespan 

193 ), 

194 ) 

195 if data_ids is not None: 

196 relation = relation.join( 

197 context.make_data_id_relation( 

198 data_ids, self.datasetType.dimensions.required.names 

199 ).transferred_to(context.sql_engine), 

200 ) 

201 return relation 

202 

203 def certify( 

204 self, 

205 collection: CollectionRecord, 

206 datasets: Iterable[DatasetRef], 

207 timespan: Timespan, 

208 context: SqlQueryContext, 

209 ) -> None: 

210 # Docstring inherited from DatasetRecordStorage. 

211 if self._calibs is None:  [branch 211 ↛ 212 not taken: the condition on line 211 was never true]

212 raise CollectionTypeError( 

213 f"Cannot certify datasets of type {self.datasetType.name}, for which " 

214 "DatasetType.isCalibration() is False." 

215 ) 

216 if collection.type is not CollectionType.CALIBRATION:  [branch 216 ↛ 217 not taken: the condition on line 216 was never true]

217 raise CollectionTypeError( 

218 f"Cannot certify into collection '{collection.name}' " 

219 f"of type {collection.type.name}; must be CALIBRATION." 

220 ) 

221 TimespanReprClass = self._db.getTimespanRepresentation() 

222 protoRow = { 

223 self._collections.getCollectionForeignKeyName(): collection.key, 

224 "dataset_type_id": self._dataset_type_id, 

225 } 

226 rows = [] 

227 dataIds: set[DataCoordinate] | None = ( 

228 set() if not TimespanReprClass.hasExclusionConstraint() else None 

229 ) 

230 summary = CollectionSummary() 

231 for dataset in summary.add_datasets_generator(datasets): 

232 row = dict(protoRow, dataset_id=dataset.id, **dataset.dataId.required) 

233 TimespanReprClass.update(timespan, result=row) 

234 rows.append(row) 

235 if dataIds is not None:  [branch 235 ↛ 231 not taken: the condition on line 235 was never false]

236 dataIds.add(dataset.dataId) 

237 # Update the summary tables for this collection in case this is the 

238 # first time this dataset type or these governor values will be 

239 # inserted there. 

240 self._summaries.update(collection, [self._dataset_type_id], summary) 

241 # Update the association table itself. 

242 if TimespanReprClass.hasExclusionConstraint():  [branch 242 ↛ 245 not taken: the condition on line 242 was never true]

243 # Rely on database constraint to enforce invariants; we just 

244 # reraise the exception for consistency across DB engines. 

245 try: 

246 self._db.insert(self._calibs, *rows) 

247 except sqlalchemy.exc.IntegrityError as err: 

248 raise ConflictingDefinitionError( 

249 f"Validity range conflict certifying datasets of type {self.datasetType.name} " 

250 f"into {collection.name} for range [{timespan.begin}, {timespan.end})." 

251 ) from err 

252 else: 

253 # Have to implement exclusion constraint ourselves. 

254 # Start by building a SELECT query for any rows that would overlap 

255 # this one. 

256 relation = self._buildCalibOverlapQuery(collection, dataIds, timespan, context) 

257 # Acquire a table lock to ensure there are no concurrent writes 

258 # that could invalidate our checking before we finish the inserts. We 

259 # use a SAVEPOINT in case there is an outer transaction that a 

260 # failure here should not roll back. 

261 with self._db.transaction(lock=[self._calibs], savepoint=True): 

262 # Enter SqlQueryContext in case we need to use a temporary 

263 # table to include the given data IDs in the query. Note that 

264 # by doing this inside the transaction, we make sure it doesn't 

265 # attempt to close the session when it's done, since it just 

266 # sees an already-open session that it knows it shouldn't 

267 # manage. 

268 with context: 

269 # Run the check SELECT query. 

270 conflicting = context.count(context.process(relation)) 

271 if conflicting > 0: 

272 raise ConflictingDefinitionError( 

273 f"{conflicting} validity range conflicts certifying datasets of type " 

274 f"{self.datasetType.name} into {collection.name} for range " 

275 f"[{timespan.begin}, {timespan.end})." 

276 ) 

277 # Proceed with the insert. 

278 self._db.insert(self._calibs, *rows) 

279 
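When the database cannot enforce a timespan exclusion constraint, `certify` emulates one: it takes a table lock inside a savepointed transaction, counts rows whose validity range overlaps the new one, and inserts only if that count is zero. A compact sketch of that pattern; `count_overlaps` and `insert_rows` are caller-supplied placeholders, not real butler APIs:

    def certify_without_exclusion_constraint(db, calibs_table, rows, timespan, count_overlaps, insert_rows):
        """Illustrative sketch of the manual exclusion-constraint check above.

        ``count_overlaps`` and ``insert_rows`` stand in for the relation and
        insert machinery; the real code raises ConflictingDefinitionError.
        """
        # Serialize writers, check for overlaps, then insert.
        with db.transaction(lock=[calibs_table], savepoint=True):
            conflicting = count_overlaps(timespan)
            if conflicting > 0:
                raise RuntimeError(
                    f"{conflicting} validity range conflicts in [{timespan.begin}, {timespan.end})."
                )
            insert_rows(rows)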

280 def decertify( 

281 self, 

282 collection: CollectionRecord, 

283 timespan: Timespan, 

284 *, 

285 dataIds: Iterable[DataCoordinate] | None = None, 

286 context: SqlQueryContext, 

287 ) -> None: 

288 # Docstring inherited from DatasetRecordStorage. 

289 if self._calibs is None:  [branch 289 ↛ 290 not taken: the condition on line 289 was never true]

290 raise CollectionTypeError( 

291 f"Cannot decertify datasets of type {self.datasetType.name}, for which " 

292 "DatasetType.isCalibration() is False." 

293 ) 

294 if collection.type is not CollectionType.CALIBRATION:  [branch 294 ↛ 295 not taken: the condition on line 294 was never true]

295 raise CollectionTypeError( 

296 f"Cannot decertify from collection '{collection.name}' " 

297 f"of type {collection.type.name}; must be CALIBRATION." 

298 ) 

299 TimespanReprClass = self._db.getTimespanRepresentation() 

300 # Construct a SELECT query to find all rows that overlap our inputs. 

301 dataIdSet: set[DataCoordinate] | None 

302 if dataIds is not None: 

303 dataIdSet = set(dataIds) 

304 else: 

305 dataIdSet = None 

306 relation = self._buildCalibOverlapQuery(collection, dataIdSet, timespan, context) 

307 calib_pkey_tag = DatasetColumnTag(self.datasetType.name, "calib_pkey") 

308 dataset_id_tag = DatasetColumnTag(self.datasetType.name, "dataset_id") 

309 timespan_tag = DatasetColumnTag(self.datasetType.name, "timespan") 

310 data_id_tags = [ 

311 (name, DimensionKeyColumnTag(name)) for name in self.datasetType.dimensions.required.names 

312 ] 

313 # Set up collections to populate with the rows we'll want to modify. 

314 # The insert rows will have the same values for collection and 

315 # dataset type. 

316 protoInsertRow = { 

317 self._collections.getCollectionForeignKeyName(): collection.key, 

318 "dataset_type_id": self._dataset_type_id, 

319 } 

320 rowsToDelete = [] 

321 rowsToInsert = [] 

322 # Acquire a table lock to ensure there are no concurrent writes 

323 # between the SELECT and the DELETE and INSERT queries based on it. 

324 with self._db.transaction(lock=[self._calibs], savepoint=True): 

325 # Enter SqlQueryContext in case we need to use a temporary table to 

326 # include the given data IDs in the query (see similar block in 

327 # certify for details). 

328 with context: 

329 for row in context.fetch_iterable(relation): 

330 rowsToDelete.append({"id": row[calib_pkey_tag]}) 

331 # Construct the insert row(s) by copying the prototype row, 

332 # then adding the dimension column values, then adding 

333 # what's left of the timespan from that row after we 

334 # subtract the given timespan. 

335 newInsertRow = protoInsertRow.copy() 

336 newInsertRow["dataset_id"] = row[dataset_id_tag] 

337 for name, tag in data_id_tags: 

338 newInsertRow[name] = row[tag] 

339 rowTimespan = row[timespan_tag] 

340 assert rowTimespan is not None, "Field should have a NOT NULL constraint." 

341 for diffTimespan in rowTimespan.difference(timespan): 

342 rowsToInsert.append( 

343 TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()) 

344 ) 

345 # Run the DELETE and INSERT queries. 

346 self._db.delete(self._calibs, ["id"], *rowsToDelete) 

347 self._db.insert(self._calibs, *rowsToInsert) 

348 
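`decertify` deletes every overlapping row and re-inserts whatever survives of its validity range after subtracting the decertified span, using `Timespan.difference`. A standalone illustration of that subtraction with made-up endpoints (assuming the top-level `Timespan` export from `lsst.daf.butler`):

    import astropy.time
    from lsst.daf.butler import Timespan

    # An existing calibration row nominally valid for all of 2024...
    existing = Timespan(astropy.time.Time("2024-01-01"), astropy.time.Time("2025-01-01"))
    # ...from which we decertify March.
    removed = Timespan(astropy.time.Time("2024-03-01"), astropy.time.Time("2024-04-01"))

    # difference() yields the surviving pieces (here, the two flanking ranges),
    # which decertify() turns into replacement rows.
    for piece in existing.difference(removed):
        print(piece)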

349 def make_relation( 

350 self, 

351 *collections: CollectionRecord, 

352 columns: Set[str], 

353 context: SqlQueryContext, 

354 ) -> Relation: 

355 # Docstring inherited from DatasetRecordStorage. 

356 collection_types = {collection.type for collection in collections} 

357 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened." 

358 TimespanReprClass = self._db.getTimespanRepresentation() 

359 # 

360 # There are two kinds of table in play here: 

361 # 

362 # - the static dataset table (with the dataset ID, dataset type ID, 

363 # run ID/name, and ingest date); 

364 # 

365 # - the dynamic tags/calibs table (with the dataset ID, dataset type 

366 # ID, collection ID/name, data ID, and possibly validity 

367 # range). 

368 # 

369 # That means that we might want to return a query against either table 

370 # or a JOIN of both, depending on which quantities the caller wants. 

371 # But the data ID is always included, which means we'll always include 

372 # the tags/calibs table and join in the static dataset table only if we 

373 # need things from it that we can't get from the tags/calibs table. 

374 # 

375 # Note that it's important that we include a WHERE constraint on both 

376 # tables for any column (e.g. dataset_type_id) that is in both when 

377 # it's given explicitly; not doing so can prevent the query planner from 

378 # using very important indexes. At present, we don't include those 

379 # redundant columns in the JOIN ON expression, however, because the 

380 # FOREIGN KEY (and its index) is defined only on dataset_id. 

381 tag_relation: Relation | None = None 

382 calib_relation: Relation | None = None 

383 if collection_types != {CollectionType.CALIBRATION}: 

384 # We'll need a subquery for the tags table if any of the given 

385 # collections are not a CALIBRATION collection. This intentionally 

386 # also fires when the list of collections is empty as a way to 

387 # create a dummy subquery that we know will fail. 

388 # We give the table an alias because it might appear multiple times 

389 # in the same query, for different dataset types. 

390 tags_parts = sql.Payload[LogicalColumn](self._tags.alias(f"{self.datasetType.name}_tags")) 

391 if "timespan" in columns: 

392 tags_parts.columns_available[DatasetColumnTag(self.datasetType.name, "timespan")] = ( 

393 TimespanReprClass.fromLiteral(Timespan(None, None)) 

394 ) 

395 tag_relation = self._finish_single_relation( 

396 tags_parts, 

397 columns, 

398 [ 

399 (record, rank) 

400 for rank, record in enumerate(collections) 

401 if record.type is not CollectionType.CALIBRATION 

402 ], 

403 context, 

404 ) 

405 assert "calib_pkey" not in columns, "For internal use only, and only for pure-calib queries." 

406 if CollectionType.CALIBRATION in collection_types: 

407 # If at least one collection is a CALIBRATION collection, we'll 

408 # need a subquery for the calibs table, and could include the 

409 # timespan as a result or constraint. 

410 assert ( 

411 self._calibs is not None 

412 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection." 

413 calibs_parts = sql.Payload[LogicalColumn](self._calibs.alias(f"{self.datasetType.name}_calibs")) 

414 if "timespan" in columns: 

415 calibs_parts.columns_available[DatasetColumnTag(self.datasetType.name, "timespan")] = ( 

416 TimespanReprClass.from_columns(calibs_parts.from_clause.columns) 

417 ) 

418 if "calib_pkey" in columns: 

419 # This is a private extension not included in the base class 

420 # interface, for internal use only in _buildCalibOverlapQuery, 

421 # which needs access to the autoincrement primary key for the 

422 # calib association table. 

423 calibs_parts.columns_available[DatasetColumnTag(self.datasetType.name, "calib_pkey")] = ( 

424 calibs_parts.from_clause.columns.id 

425 ) 

426 calib_relation = self._finish_single_relation( 

427 calibs_parts, 

428 columns, 

429 [ 

430 (record, rank) 

431 for rank, record in enumerate(collections) 

432 if record.type is CollectionType.CALIBRATION 

433 ], 

434 context, 

435 ) 

436 if tag_relation is not None: 

437 if calib_relation is not None: 

438 # daf_relation's chain operation does not automatically 

439 # deduplicate; it's more like SQL's UNION ALL. To get UNION 

440 # in SQL here, we add an explicit deduplication. 

441 return tag_relation.chain(calib_relation).without_duplicates() 

442 else: 

443 return tag_relation 

444 elif calib_relation is not None: 

445 return calib_relation 

446 else: 

447 raise AssertionError("Branch should be unreachable.") 

448 
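As the comment above notes, `Relation.chain` behaves like SQL UNION ALL, so an explicit `without_duplicates()` is added to get UNION semantics when both the tags and calibs subqueries are present. The same distinction expressed directly in SQLAlchemy, purely for illustration; this is not how the module builds its queries:

    import sqlalchemy

    metadata = sqlalchemy.MetaData()
    tags = sqlalchemy.Table("tags", metadata, sqlalchemy.Column("dataset_id", sqlalchemy.Integer))
    calibs = sqlalchemy.Table("calibs", metadata, sqlalchemy.Column("dataset_id", sqlalchemy.Integer))

    sel_tags = sqlalchemy.select(tags.c.dataset_id)
    sel_calibs = sqlalchemy.select(calibs.c.dataset_id)

    print(sqlalchemy.union_all(sel_tags, sel_calibs))  # keeps duplicates, like chain()
    print(sqlalchemy.union(sel_tags, sel_calibs))      # deduplicates, like chain().without_duplicates()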

449 def _finish_single_relation( 

450 self, 

451 payload: sql.Payload[LogicalColumn], 

452 requested_columns: Set[str], 

453 collections: Sequence[tuple[CollectionRecord, int]], 

454 context: SqlQueryContext, 

455 ) -> Relation: 

456 """Handle adding columns and WHERE terms that are not specific to 

457 either the tags or calibs tables. 

458 

459 Helper method for `make_relation`. 

460 

461 Parameters 

462 ---------- 

463 payload : `lsst.daf.relation.sql.Payload` 

464 SQL query parts under construction, to be modified in-place and 

465 used to construct the new relation. 

466 requested_columns : `~collections.abc.Set` [ `str` ] 

467 Columns the relation should include. 

468 collections : `~collections.abc.Sequence` [ `tuple` \ 

469 [ `CollectionRecord`, `int` ] ] 

470 Collections to search for the dataset and their ranks. 

471 context : `SqlQueryContext` 

472 Context that manages engines and state for the query. 

473 

474 Returns 

475 ------- 

476 relation : `lsst.daf.relation.Relation` 

477 New dataset query relation. 

478 """ 

479 payload.where.append(payload.from_clause.columns.dataset_type_id == self._dataset_type_id) 

480 dataset_id_col = payload.from_clause.columns.dataset_id 

481 collection_col = payload.from_clause.columns[self._collections.getCollectionForeignKeyName()] 

482 # We always constrain and optionally retrieve the collection(s) via the 

483 # tags/calibs table. 

484 if len(collections) == 1: 

485 payload.where.append(collection_col == collections[0][0].key) 

486 if "collection" in requested_columns: 

487 payload.columns_available[DatasetColumnTag(self.datasetType.name, "collection")] = ( 

488 sqlalchemy.sql.literal(collections[0][0].key) 

489 ) 

490 else: 

491 assert collections, "The no-collections case should be in calling code for better diagnostics." 

492 payload.where.append(collection_col.in_([collection.key for collection, _ in collections])) 

493 if "collection" in requested_columns: 

494 payload.columns_available[DatasetColumnTag(self.datasetType.name, "collection")] = ( 

495 collection_col 

496 ) 

497 # Add rank, if requested, as a CASE-based calculation on the collection 

498 # column. 

499 if "rank" in requested_columns: 

500 payload.columns_available[DatasetColumnTag(self.datasetType.name, "rank")] = sqlalchemy.sql.case( 

501 {record.key: rank for record, rank in collections}, 

502 value=collection_col, 

503 ) 

504 # Add more column definitions, starting with the data ID. 

505 for dimension_name in self.datasetType.dimensions.required.names: 

506 payload.columns_available[DimensionKeyColumnTag(dimension_name)] = payload.from_clause.columns[ 

507 dimension_name 

508 ] 

509 # We can always get the dataset_id from the tags/calibs table. 

510 if "dataset_id" in requested_columns: 

511 payload.columns_available[DatasetColumnTag(self.datasetType.name, "dataset_id")] = dataset_id_col 

512 # It's possible we now have everything we need, from just the 

513 # tags/calibs table. The things we might need to get from the static 

514 # dataset table are the run key and the ingest date. 

515 need_static_table = False 

516 if "run" in requested_columns: 

517 if len(collections) == 1 and collections[0][0].type is CollectionType.RUN: 

518 # If we are searching exactly one RUN collection, we 

519 # know that if we find the dataset in that collection, 

520 # then that's the dataset's run; we don't need to 

521 # query for it. 

522 payload.columns_available[DatasetColumnTag(self.datasetType.name, "run")] = ( 

523 sqlalchemy.sql.literal(collections[0][0].key) 

524 ) 

525 else: 

526 payload.columns_available[DatasetColumnTag(self.datasetType.name, "run")] = ( 

527 self._static.dataset.columns[self._runKeyColumn] 

528 ) 

529 need_static_table = True 

530 # Ingest date can only come from the static table. 

531 if "ingest_date" in requested_columns: 

532 need_static_table = True 

533 payload.columns_available[DatasetColumnTag(self.datasetType.name, "ingest_date")] = ( 

534 self._static.dataset.columns.ingest_date 

535 ) 

536 # If we need the static table, join it in via dataset_id and 

537 # dataset_type_id 

538 if need_static_table: 

539 payload.from_clause = payload.from_clause.join( 

540 self._static.dataset, onclause=(dataset_id_col == self._static.dataset.columns.id) 

541 ) 

542 # Also constrain dataset_type_id in static table in case that helps 

543 # generate a better plan. 

544 # We could also include this in the JOIN ON clause, but my guess is 

545 # that that's a good idea IFF it's in the foreign key, and right 

546 # now it isn't. 

547 payload.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id) 

548 leaf = context.sql_engine.make_leaf( 

549 payload.columns_available.keys(), 

550 payload=payload, 

551 name=self.datasetType.name, 

552 parameters={record.name: rank for record, rank in collections}, 

553 ) 

554 return leaf 

555 
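The "rank" column above uses the dictionary form of `sqlalchemy.sql.case` with `value=`, mapping each collection key to its position in the search order. A minimal standalone sketch; the table and key values are made up:

    import sqlalchemy

    metadata = sqlalchemy.MetaData()
    tags = sqlalchemy.Table("tags", metadata, sqlalchemy.Column("collection_id", sqlalchemy.Integer))

    # Hypothetical collection keys, in search order.
    ranks = {11: 0, 42: 1, 7: 2}
    rank_column = sqlalchemy.sql.case(ranks, value=tags.c.collection_id)

    # Renders roughly as:
    #   CASE tags.collection_id WHEN 11 THEN 0 WHEN 42 THEN 1 WHEN 7 THEN 2 END
    print(sqlalchemy.select(rank_column.label("rank")))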

556 def getDataId(self, id: DatasetId) -> DataCoordinate: 

557 """Return DataId for a dataset. 

558 

559 Parameters 

560 ---------- 

561 id : `DatasetId` 

562 Unique dataset identifier. 

563 

564 Returns 

565 ------- 

566 dataId : `DataCoordinate` 

567 DataId for the dataset. 

568 """ 

569 # This query could return multiple rows (one for each tagged collection 

570 # the dataset is in, plus one for its run collection), and we don't 

571 # care which of those we get. 

572 sql = ( 

573 self._tags.select() 

574 .where( 

575 sqlalchemy.sql.and_( 

576 self._tags.columns.dataset_id == id, 

577 self._tags.columns.dataset_type_id == self._dataset_type_id, 

578 ) 

579 ) 

580 .limit(1) 

581 ) 

582 with self._db.query(sql) as sql_result: 

583 row = sql_result.mappings().fetchone() 

584 assert row is not None, "Should be guaranteed by caller and foreign key constraints." 

585 return DataCoordinate.from_required_values( 

586 self.datasetType.dimensions.as_group(), 

587 tuple(row[dimension] for dimension in self.datasetType.dimensions.required.names), 

588 ) 

589 

590 

591class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage): 

592 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for 

593 dataset IDs. 

594 """ 

595 

596 idMaker = DatasetIdFactory() 

597 """Factory for dataset IDs. In the future this factory may be shared with 

598 other classes (e.g. Registry).""" 

599 

600 def insert( 

601 self, 

602 run: RunRecord, 

603 dataIds: Iterable[DataCoordinate], 

604 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

605 ) -> Iterator[DatasetRef]: 

606 # Docstring inherited from DatasetRecordStorage. 

607 

608 # Current timestamp, type depends on schema version. Use microsecond 

609 # precision for astropy time to keep things consistent with 

610 # TIMESTAMP(6) SQL type. 

611 timestamp: datetime.datetime | astropy.time.Time 

612 if self._use_astropy: 

613 # Astropy `now()` precision should be the same as `datetime.now()`, which 

614 # should mean microsecond. 

615 timestamp = astropy.time.Time.now() 

616 else: 

617 timestamp = datetime.datetime.now(datetime.UTC) 

618 

619 # Iterate over data IDs, transforming a possibly-single-pass iterable 

620 # into a list. 

621 dataIdList: list[DataCoordinate] = [] 

622 rows = [] 

623 summary = CollectionSummary() 

624 for dataId in summary.add_data_ids_generator(self.datasetType, dataIds): 

625 dataIdList.append(dataId) 

626 rows.append( 

627 { 

628 "id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode), 

629 "dataset_type_id": self._dataset_type_id, 

630 self._runKeyColumn: run.key, 

631 "ingest_date": timestamp, 

632 } 

633 ) 

634 

635 with self._db.transaction(): 

636 # Insert into the static dataset table. 

637 self._db.insert(self._static.dataset, *rows) 

638 # Update the summary tables for this collection in case this is the 

639 # first time this dataset type or these governor values will be 

640 # inserted there. 

641 self._summaries.update(run, [self._dataset_type_id], summary) 

642 # Combine the generated dataset_id values and data ID fields to 

643 # form rows to be inserted into the tags table. 

644 protoTagsRow = { 

645 "dataset_type_id": self._dataset_type_id, 

646 self._collections.getCollectionForeignKeyName(): run.key, 

647 } 

648 tagsRows = [ 

649 dict(protoTagsRow, dataset_id=row["id"], **dataId.required) 

650 for dataId, row in zip(dataIdList, rows, strict=True) 

651 ] 

652 # Insert those rows into the tags table. 

653 self._db.insert(self._tags, *tagsRows) 

654 

655 for dataId, row in zip(dataIdList, rows, strict=True): 

656 yield DatasetRef( 

657 datasetType=self.datasetType, 

658 dataId=dataId, 

659 id=row["id"], 

660 run=run.name, 

661 ) 

662 

663 def import_( 

664 self, 

665 run: RunRecord, 

666 datasets: Iterable[DatasetRef], 

667 ) -> Iterator[DatasetRef]: 

668 # Docstring inherited from DatasetRecordStorage. 

669 

670 # Current timestamp, type depends on schema version. 

671 if self._use_astropy: 

672 # Astropy `now()` precision should be the same as `datetime.now()`, which 

673 # should mean microsecond. 

674 timestamp = sqlalchemy.sql.literal(astropy.time.Time.now(), type_=ddl.AstropyTimeNsecTai) 

675 else: 

676 timestamp = sqlalchemy.sql.literal(datetime.datetime.now(datetime.UTC)) 

677 

678 # Iterate over the datasets, transforming a possibly-single-pass iterable 

679 # into an in-memory mapping from dataset ID to data ID. 

681 summary = CollectionSummary() 

682 for dataset in summary.add_datasets_generator(datasets): 

683 dataIds[dataset.id] = dataset.dataId 

684 

685 # We'll insert all new rows into a temporary table. 

686 tableSpec = makeTagTableSpec(self.datasetType, type(self._collections), ddl.GUID, constraints=False) 

687 collFkName = self._collections.getCollectionForeignKeyName() 

688 protoTagsRow = { 

689 "dataset_type_id": self._dataset_type_id, 

690 collFkName: run.key, 

691 } 

692 tmpRows = [ 

693 dict(protoTagsRow, dataset_id=dataset_id, **dataId.required) 

694 for dataset_id, dataId in dataIds.items() 

695 ] 

696 with self._db.transaction(for_temp_tables=True), self._db.temporary_table(tableSpec) as tmp_tags: 

697 # Store all incoming data in a temporary table. 

698 self._db.insert(tmp_tags, *tmpRows) 

699 

700 # There are some checks that we want to make for consistency 

701 # of the new datasets with existing ones. 

702 self._validateImport(tmp_tags, run) 

703 

704 # Before we merge the temporary table into dataset/tags we need to 

705 # drop datasets which are already there (and do not conflict). 

706 self._db.deleteWhere( 

707 tmp_tags, 

708 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)), 

709 ) 

710 

711 # Copy it into the dataset table; we need to re-label some columns. 

712 self._db.insert( 

713 self._static.dataset, 

714 select=sqlalchemy.sql.select( 

715 tmp_tags.columns.dataset_id.label("id"), 

716 tmp_tags.columns.dataset_type_id, 

717 tmp_tags.columns[collFkName].label(self._runKeyColumn), 

718 timestamp.label("ingest_date"), 

719 ), 

720 ) 

721 

722 # Update the summary tables for this collection in case this 

723 # is the first time this dataset type or these governor values 

724 # will be inserted there. 

725 self._summaries.update(run, [self._dataset_type_id], summary) 

726 

727 # Copy it into the tags table. 

728 self._db.insert(self._tags, select=tmp_tags.select()) 

729 

730 # Return refs in the same order as in the input list. 

731 for dataset_id, dataId in dataIds.items(): 

732 yield DatasetRef( 

733 datasetType=self.datasetType, 

734 id=dataset_id, 

735 dataId=dataId, 

736 run=run.name, 

737 ) 

738 

739 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None: 

740 """Validate imported refs against existing datasets. 

741 

742 Parameters 

743 ---------- 

744 tmp_tags : `sqlalchemy.schema.Table` 

745 Temporary table with new datasets and the same schema as tags 

746 table. 

747 run : `RunRecord` 

748 The record object describing the `~CollectionType.RUN` collection. 

749 

750 Raises 

751 ------ 

752 ConflictingDefinitionError 

753 Raised if new datasets conflict with existing ones. 

754 """ 

755 dataset = self._static.dataset 

756 tags = self._tags 

757 collFkName = self._collections.getCollectionForeignKeyName() 

758 

759 # Check that existing datasets have the same dataset type and 

760 # run. 

761 query = ( 

762 sqlalchemy.sql.select( 

763 dataset.columns.id.label("dataset_id"), 

764 dataset.columns.dataset_type_id.label("dataset_type_id"), 

765 tmp_tags.columns.dataset_type_id.label("new_dataset_type_id"), 

766 dataset.columns[self._runKeyColumn].label("run"), 

767 tmp_tags.columns[collFkName].label("new_run"), 

768 ) 

769 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id)) 

770 .where( 

771 sqlalchemy.sql.or_( 

772 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

773 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName], 

774 ) 

775 ) 

776 .limit(1) 

777 ) 

778 with self._db.query(query) as result: 

779 # Only include the first one in the exception message 

780 if (row := result.first()) is not None: 

781 existing_run = self._collections[row.run].name 

782 new_run = self._collections[row.new_run].name 

783 if row.dataset_type_id == self._dataset_type_id: 

784 if row.new_dataset_type_id == self._dataset_type_id:  [branch 784 ↛ 790 not taken: the condition on line 784 was never false]

785 raise ConflictingDefinitionError( 

786 f"Current run {existing_run!r} and new run {new_run!r} do not agree for " 

787 f"dataset {row.dataset_id}." 

788 ) 

789 else: 

790 raise ConflictingDefinitionError( 

791 f"Dataset {row.dataset_id} was provided with type {self.datasetType.name!r} " 

792 f"in run {new_run!r}, but was already defined with type ID {row.dataset_type_id} " 

793 f"in run {run!r}." 

794 ) 

795 else: 

796 raise ConflictingDefinitionError( 

797 f"Dataset {row.dataset_id} was provided with type ID {row.new_dataset_type_id} " 

798 f"in run {new_run!r}, but was already defined with type {self.datasetType.name!r} " 

799 f"in run {run!r}." 

800 ) 

801 

802 # Check that matching dataset in tags table has the same DataId. 

803 query = ( 

804 sqlalchemy.sql.select( 

805 tags.columns.dataset_id, 

806 tags.columns.dataset_type_id.label("type_id"), 

807 tmp_tags.columns.dataset_type_id.label("new_type_id"), 

808 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

809 *[ 

810 tmp_tags.columns[dim].label(f"new_{dim}") 

811 for dim in self.datasetType.dimensions.required.names 

812 ], 

813 ) 

814 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id)) 

815 .where( 

816 sqlalchemy.sql.or_( 

817 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

818 *[ 

819 tags.columns[dim] != tmp_tags.columns[dim] 

820 for dim in self.datasetType.dimensions.required.names 

821 ], 

822 ) 

823 ) 

824 .limit(1) 

825 ) 

826 

827 with self._db.query(query) as result: 

828 if (row := result.first()) is not None: 

829 # Only include the first one in the exception message 

830 raise ConflictingDefinitionError( 

831 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}" 

832 ) 

833 

834 # Check that matching run+dataId have the same dataset ID. 

835 query = ( 

836 sqlalchemy.sql.select( 

837 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

838 tags.columns.dataset_id, 

839 tmp_tags.columns.dataset_id.label("new_dataset_id"), 

840 tags.columns[collFkName], 

841 tmp_tags.columns[collFkName].label(f"new_{collFkName}"), 

842 ) 

843 .select_from( 

844 tags.join( 

845 tmp_tags, 

846 sqlalchemy.sql.and_( 

847 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id, 

848 tags.columns[collFkName] == tmp_tags.columns[collFkName], 

849 *[ 

850 tags.columns[dim] == tmp_tags.columns[dim] 

851 for dim in self.datasetType.dimensions.required.names 

852 ], 

853 ), 

854 ) 

855 ) 

856 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id) 

857 .limit(1) 

858 ) 

859 with self._db.query(query) as result: 

860 # only include the first one in the exception message 

861 if (row := result.first()) is not None: 

862 data_id = {dim: getattr(row, dim) for dim in self.datasetType.dimensions.required.names} 

863 existing_collection = self._collections[getattr(row, collFkName)].name 

864 new_collection = self._collections[getattr(row, f"new_{collFkName}")].name 

865 raise ConflictingDefinitionError( 

866 f"Dataset with type {self.datasetType.name!r} and data ID {data_id} " 

867 f"has ID {row.dataset_id} in existing collection {existing_collection!r} " 

868 f"but ID {row.new_dataset_id} in new collection {new_collection!r}." 

869 )