Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 95%

245 statements  

coverage.py v7.3.1, created at 2023-10-02 07:59 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28 

29from __future__ import annotations 

30 

31__all__ = ("ByDimensionsDatasetRecordStorage",) 

32 

33from collections.abc import Iterable, Iterator, Sequence, Set 

34from datetime import datetime 

35from typing import TYPE_CHECKING 

36 

37import astropy.time 

38import sqlalchemy 

39from lsst.daf.relation import Relation, sql 

40 

41from ....core import ( 

42 DataCoordinate, 

43 DatasetColumnTag, 

44 DatasetId, 

45 DatasetIdFactory, 

46 DatasetIdGenEnum, 

47 DatasetRef, 

48 DatasetType, 

49 DimensionKeyColumnTag, 

50 LogicalColumn, 

51 Timespan, 

52 ddl, 

53) 

54from ..._collection_summary import CollectionSummary 

55from ..._collectionType import CollectionType 

56from ..._exceptions import CollectionTypeError, ConflictingDefinitionError 

57from ...interfaces import DatasetRecordStorage 

58from ...queries import SqlQueryContext 

59from .tables import makeTagTableSpec 

60 

61if TYPE_CHECKING: 

62 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord 

63 from .summaries import CollectionSummaryManager 

64 from .tables import StaticDatasetTablesTuple 

65 

66 

67class ByDimensionsDatasetRecordStorage(DatasetRecordStorage): 

68 """Dataset record storage implementation paired with 

69 `ByDimensionsDatasetRecordStorageManagerUUID`; see that class for more 

70 information. 

71 

72 Instances of this class should never be constructed directly; use 

73 `DatasetRecordStorageManager.register` instead. 

74 """ 

75 

76 def __init__( 

77 self, 

78 *, 

79 datasetType: DatasetType, 

80 db: Database, 

81 dataset_type_id: int, 

82 collections: CollectionManager, 

83 static: StaticDatasetTablesTuple, 

84 summaries: CollectionSummaryManager, 

85 tags: sqlalchemy.schema.Table, 

86 use_astropy_ingest_date: bool, 

87 calibs: sqlalchemy.schema.Table | None, 

88 ): 

89 super().__init__(datasetType=datasetType) 

90 self._dataset_type_id = dataset_type_id 

91 self._db = db 

92 self._collections = collections 

93 self._static = static 

94 self._summaries = summaries 

95 self._tags = tags 

96 self._calibs = calibs 

97 self._runKeyColumn = collections.getRunForeignKeyName() 

98 self._use_astropy = use_astropy_ingest_date 

99 

100 def delete(self, datasets: Iterable[DatasetRef]) -> None: 

101 # Docstring inherited from DatasetRecordStorage. 

102 # Only delete from common dataset table; ON DELETE foreign key clauses 

103 # will handle the rest. 

104 self._db.delete( 

105 self._static.dataset, 

106 ["id"], 

107 *[{"id": dataset.id} for dataset in datasets], 

108 ) 

109 
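    # A note on why the single DELETE above suffices (a sketch of the intent,
    # assuming the schema layer declares cascading foreign keys; the exact DDL
    # is produced elsewhere):
    #
    #   CREATE TABLE <datasetType>_tags (
    #       dataset_id ... REFERENCES dataset (id) ON DELETE CASCADE,
    #       ...
    #   );
    #
    # With such a clause on the tags and calibs tables, removing a row from
    # the static dataset table also removes its tag and calib association
    # rows.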

110 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

111 # Docstring inherited from DatasetRecordStorage. 

112 if collection.type is not CollectionType.TAGGED:    [112 ↛ 113: the condition on line 112 was never true]

113 raise TypeError( 

114 f"Cannot associate into collection '{collection.name}' " 

115 f"of type {collection.type.name}; must be TAGGED." 

116 ) 

117 protoRow = { 

118 self._collections.getCollectionForeignKeyName(): collection.key, 

119 "dataset_type_id": self._dataset_type_id, 

120 } 

121 rows = [] 

122 summary = CollectionSummary() 

123 for dataset in summary.add_datasets_generator(datasets): 

124 row = dict(protoRow, dataset_id=dataset.id) 

125 for dimension, value in dataset.dataId.items(): 

126 row[dimension.name] = value 

127 rows.append(row) 

128 # Update the summary tables for this collection in case this is the 

129 # first time this dataset type or these governor values will be 

130 # inserted there. 

131 self._summaries.update(collection, [self._dataset_type_id], summary) 

132 # Update the tag table itself. 

133 self._db.replace(self._tags, *rows) 

134 

135 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

136 # Docstring inherited from DatasetRecordStorage. 

137 if collection.type is not CollectionType.TAGGED:    [137 ↛ 138: the condition on line 137 was never true]

138 raise TypeError( 

139 f"Cannot disassociate from collection '{collection.name}' " 

140 f"of type {collection.type.name}; must be TAGGED." 

141 ) 

142 rows = [ 

143 { 

144 "dataset_id": dataset.id, 

145 self._collections.getCollectionForeignKeyName(): collection.key, 

146 } 

147 for dataset in datasets 

148 ] 

149 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows) 

150 

151 def _buildCalibOverlapQuery( 

152 self, 

153 collection: CollectionRecord, 

154 data_ids: set[DataCoordinate] | None, 

155 timespan: Timespan, 

156 context: SqlQueryContext, 

157 ) -> Relation: 

158 relation = self.make_relation( 

159 collection, columns={"timespan", "dataset_id", "calib_pkey"}, context=context 

160 ).with_rows_satisfying( 

161 context.make_timespan_overlap_predicate( 

162 DatasetColumnTag(self.datasetType.name, "timespan"), timespan 

163 ), 

164 ) 

165 if data_ids is not None: 

166 relation = relation.join( 

167 context.make_data_id_relation( 

168 data_ids, self.datasetType.dimensions.required.names 

169 ).transferred_to(context.sql_engine), 

170 ) 

171 return relation 

172 
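    # For reference, the overlap predicate built above uses the usual
    # half-open-interval semantics: timespans [a, b) and [c, d) overlap if and
    # only if a < d and c < b (an illustrative statement of the rule, not code
    # from this module).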

173 def certify( 

174 self, 

175 collection: CollectionRecord, 

176 datasets: Iterable[DatasetRef], 

177 timespan: Timespan, 

178 context: SqlQueryContext, 

179 ) -> None: 

180 # Docstring inherited from DatasetRecordStorage. 

181 if self._calibs is None:    [181 ↛ 182: the condition on line 181 was never true]

182 raise CollectionTypeError( 

183 f"Cannot certify datasets of type {self.datasetType.name}, for which " 

184 "DatasetType.isCalibration() is False." 

185 ) 

186 if collection.type is not CollectionType.CALIBRATION:    [186 ↛ 187: the condition on line 186 was never true]

187 raise CollectionTypeError( 

188 f"Cannot certify into collection '{collection.name}' " 

189 f"of type {collection.type.name}; must be CALIBRATION." 

190 ) 

191 TimespanReprClass = self._db.getTimespanRepresentation() 

192 protoRow = { 

193 self._collections.getCollectionForeignKeyName(): collection.key, 

194 "dataset_type_id": self._dataset_type_id, 

195 } 

196 rows = [] 

197 dataIds: set[DataCoordinate] | None = ( 

198 set() if not TimespanReprClass.hasExclusionConstraint() else None 

199 ) 

200 summary = CollectionSummary() 

201 for dataset in summary.add_datasets_generator(datasets): 

202 row = dict(protoRow, dataset_id=dataset.id) 

203 for dimension, value in dataset.dataId.items(): 

204 row[dimension.name] = value 

205 TimespanReprClass.update(timespan, result=row) 

206 rows.append(row) 

207 if dataIds is not None:    [207 ↛ 201: the condition on line 207 was never false]

208 dataIds.add(dataset.dataId) 

209 # Update the summary tables for this collection in case this is the 

210 # first time this dataset type or these governor values will be 

211 # inserted there. 

212 self._summaries.update(collection, [self._dataset_type_id], summary) 

213 # Update the association table itself. 

214 if TimespanReprClass.hasExclusionConstraint():    [214 ↛ 217: the condition on line 214 was never true]

215 # Rely on database constraint to enforce invariants; we just 

216 # reraise the exception for consistency across DB engines. 

217 try: 

218 self._db.insert(self._calibs, *rows) 

219 except sqlalchemy.exc.IntegrityError as err: 

220 raise ConflictingDefinitionError( 

221 f"Validity range conflict certifying datasets of type {self.datasetType.name} " 

222 f"into {collection.name} for range [{timespan.begin}, {timespan.end})." 

223 ) from err 

224 else: 

225 # Have to implement exclusion constraint ourselves. 

226 # Start by building a SELECT query for any rows that would overlap 

227 # this one. 

228 relation = self._buildCalibOverlapQuery(collection, dataIds, timespan, context) 

229 # Acquire a table lock to ensure there are no concurrent writes 

230 # that could invalidate our checking before we finish the inserts. We 

231 # use a SAVEPOINT in case there is an outer transaction that a 

232 # failure here should not roll back. 

233 with self._db.transaction(lock=[self._calibs], savepoint=True): 

234 # Enter SqlQueryContext in case we need to use a temporary 

235 # table to include the given data IDs in the query. Note that 

236 # by doing this inside the transaction, we make sure it doesn't 

237 # attempt to close the session when it's done, since it just 

238 # sees an already-open session that it knows it shouldn't 

239 # manage. 

240 with context: 

241 # Run the check SELECT query. 

242 conflicting = context.count(context.process(relation)) 

243 if conflicting > 0: 

244 raise ConflictingDefinitionError( 

245 f"{conflicting} validity range conflicts certifying datasets of type " 

246 f"{self.datasetType.name} into {collection.name} for range " 

247 f"[{timespan.begin}, {timespan.end})." 

248 ) 

249 # Proceed with the insert. 

250 self._db.insert(self._calibs, *rows) 

251 
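    # When the database does support exclusion constraints (the
    # hasExclusionConstraint() branch above), the overlap check is enforced
    # declaratively. On PostgreSQL with btree_gist this corresponds roughly to
    # a constraint like the following sketch (illustrative only; the real
    # schema and column names are defined elsewhere):
    #
    #   ALTER TABLE <datasetType>_calibs ADD CONSTRAINT <name>_excl
    #       EXCLUDE USING gist (
    #           dataset_type_id WITH =,
    #           <collection column> WITH =,
    #           -- one "<dimension column> WITH =" term per required dimension
    #           timespan WITH &&
    #       );
    #
    # Other engines fall back to the explicit lock + SELECT + INSERT sequence
    # implemented above.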

252 def decertify( 

253 self, 

254 collection: CollectionRecord, 

255 timespan: Timespan, 

256 *, 

257 dataIds: Iterable[DataCoordinate] | None = None, 

258 context: SqlQueryContext, 

259 ) -> None: 

260 # Docstring inherited from DatasetRecordStorage. 

261 if self._calibs is None:    [261 ↛ 262: the condition on line 261 was never true]

262 raise CollectionTypeError( 

263 f"Cannot decertify datasets of type {self.datasetType.name}, for which " 

264 "DatasetType.isCalibration() is False." 

265 ) 

266 if collection.type is not CollectionType.CALIBRATION:    [266 ↛ 267: the condition on line 266 was never true]

267 raise CollectionTypeError( 

268 f"Cannot decertify from collection '{collection.name}' " 

269 f"of type {collection.type.name}; must be CALIBRATION." 

270 ) 

271 TimespanReprClass = self._db.getTimespanRepresentation() 

272 # Construct a SELECT query to find all rows that overlap our inputs. 

273 dataIdSet: set[DataCoordinate] | None 

274 if dataIds is not None: 

275 dataIdSet = set(dataIds) 

276 else: 

277 dataIdSet = None 

278 relation = self._buildCalibOverlapQuery(collection, dataIdSet, timespan, context) 

279 calib_pkey_tag = DatasetColumnTag(self.datasetType.name, "calib_pkey") 

280 dataset_id_tag = DatasetColumnTag(self.datasetType.name, "dataset_id") 

281 timespan_tag = DatasetColumnTag(self.datasetType.name, "timespan") 

282 data_id_tags = [ 

283 (name, DimensionKeyColumnTag(name)) for name in self.datasetType.dimensions.required.names 

284 ] 

285 # Set up collections to populate with the rows we'll want to modify. 

286 # The insert rows will have the same values for collection and 

287 # dataset type. 

288 protoInsertRow = { 

289 self._collections.getCollectionForeignKeyName(): collection.key, 

290 "dataset_type_id": self._dataset_type_id, 

291 } 

292 rowsToDelete = [] 

293 rowsToInsert = [] 

294 # Acquire a table lock to ensure there are no concurrent writes 

295 # between the SELECT and the DELETE and INSERT queries based on it. 

296 with self._db.transaction(lock=[self._calibs], savepoint=True): 

297 # Enter SqlQueryContext in case we need to use a temporary table to 

298 # include the given data IDs in the query (see similar block in 

299 # certify for details). 

300 with context: 

301 for row in context.fetch_iterable(relation): 

302 rowsToDelete.append({"id": row[calib_pkey_tag]}) 

303 # Construct the insert row(s) by copying the prototype row, 

304 # then adding the dimension column values, then adding 

305 # what's left of the timespan from that row after we 

306 # subtract the given timespan. 

307 newInsertRow = protoInsertRow.copy() 

308 newInsertRow["dataset_id"] = row[dataset_id_tag] 

309 for name, tag in data_id_tags: 

310 newInsertRow[name] = row[tag] 

311 rowTimespan = row[timespan_tag] 

312 assert rowTimespan is not None, "Field should have a NOT NULL constraint." 

313 for diffTimespan in rowTimespan.difference(timespan): 

314 rowsToInsert.append( 

315 TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()) 

316 ) 

317 # Run the DELETE and INSERT queries. 

318 self._db.delete(self._calibs, ["id"], *rowsToDelete) 

319 self._db.insert(self._calibs, *rowsToInsert) 

320 
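    # A worked example of the timespan arithmetic above (dates illustrative):
    # if an existing calibration row covers [2020-01-01, 2020-05-01) and we
    # decertify [2020-02-01, 2020-03-01), that row is deleted and replaced by
    # two rows covering [2020-01-01, 2020-02-01) and [2020-03-01, 2020-05-01);
    # if the decertified timespan covers the row entirely, it is deleted with
    # no replacement.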

321 def make_relation( 

322 self, 

323 *collections: CollectionRecord, 

324 columns: Set[str], 

325 context: SqlQueryContext, 

326 ) -> Relation: 

327 # Docstring inherited from DatasetRecordStorage. 

328 collection_types = {collection.type for collection in collections} 

329 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened." 

330 TimespanReprClass = self._db.getTimespanRepresentation() 

331 # 

332 # There are two kinds of table in play here: 

333 # 

334 # - the static dataset table (with the dataset ID, dataset type ID, 

335 # run ID/name, and ingest date); 

336 # 

337 # - the dynamic tags/calibs table (with the dataset ID, dataset type 

338 # ID, collection ID/name, data ID, and possibly validity 

339 # range). 

340 # 

341 # That means that we might want to return a query against either table 

342 # or a JOIN of both, depending on which quantities the caller wants. 

343 # But the data ID is always included, which means we'll always include 

344 # the tags/calibs table and join in the static dataset table only if we 

345 # need things from it that we can't get from the tags/calibs table. 

346 # 

347 # Note that it's important that we include a WHERE constraint on both 

348 # tables for any column (e.g. dataset_type_id) that is in both when 

349 # it's given explicitly; not doing so can prevent the query planner from 

350 # using very important indexes. At present, we don't include those 

351 # redundant columns in the JOIN ON expression, however, because the 

352 # FOREIGN KEY (and its index) are defined only on dataset_id. 
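        # As a rough sketch of the result (illustrative only; actual table and
        # column names depend on the dataset type and collection manager), a
        # query that also needs "run" or "ingest_date" is shaped like
        #
        #   SELECT t.<data ID columns>, t.dataset_id, d.<run column>, d.ingest_date
        #   FROM <datasetType>_tags AS t
        #   JOIN dataset AS d ON d.id = t.dataset_id
        #   WHERE t.dataset_type_id = :dtid
        #     AND d.dataset_type_id = :dtid
        #     AND t.<collection column> IN (...)
        #
        # while a query that needs only the data ID and dataset_id skips the
        # JOIN entirely.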

353 tag_relation: Relation | None = None 

354 calib_relation: Relation | None = None 

355 if collection_types != {CollectionType.CALIBRATION}: 

356 # We'll need a subquery for the tags table if any of the given 

357 # collections are not a CALIBRATION collection. This intentionally 

358 # also fires when the list of collections is empty as a way to 

359 # create a dummy subquery that we know will fail. 

360 # We give the table an alias because it might appear multiple times 

361 # in the same query, for different dataset types. 

362 tags_parts = sql.Payload[LogicalColumn](self._tags.alias(f"{self.datasetType.name}_tags")) 

363 if "timespan" in columns: 

364 tags_parts.columns_available[ 

365 DatasetColumnTag(self.datasetType.name, "timespan") 

366 ] = TimespanReprClass.fromLiteral(Timespan(None, None)) 

367 tag_relation = self._finish_single_relation( 

368 tags_parts, 

369 columns, 

370 [ 

371 (record, rank) 

372 for rank, record in enumerate(collections) 

373 if record.type is not CollectionType.CALIBRATION 

374 ], 

375 context, 

376 ) 

377 assert "calib_pkey" not in columns, "For internal use only, and only for pure-calib queries." 

378 if CollectionType.CALIBRATION in collection_types: 

379 # If at least one collection is a CALIBRATION collection, we'll 

380 # need a subquery for the calibs table, and could include the 

381 # timespan as a result or constraint. 

382 assert ( 

383 self._calibs is not None 

384 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection." 

385 calibs_parts = sql.Payload[LogicalColumn](self._calibs.alias(f"{self.datasetType.name}_calibs")) 

386 if "timespan" in columns: 

387 calibs_parts.columns_available[ 

388 DatasetColumnTag(self.datasetType.name, "timespan") 

389 ] = TimespanReprClass.from_columns(calibs_parts.from_clause.columns) 

390 if "calib_pkey" in columns: 

391 # This is a private extension not included in the base class 

392 # interface, for internal use only in _buildCalibOverlapQuery, 

393 # which needs access to the autoincrement primary key for the 

394 # calib association table. 

395 calibs_parts.columns_available[ 

396 DatasetColumnTag(self.datasetType.name, "calib_pkey") 

397 ] = calibs_parts.from_clause.columns.id 

398 calib_relation = self._finish_single_relation( 

399 calibs_parts, 

400 columns, 

401 [ 

402 (record, rank) 

403 for rank, record in enumerate(collections) 

404 if record.type is CollectionType.CALIBRATION 

405 ], 

406 context, 

407 ) 

408 if tag_relation is not None: 

409 if calib_relation is not None: 

410 # daf_relation's chain operation does not automatically 

411 # deduplicate; it's more like SQL's UNION ALL. To get UNION 

412 # in SQL here, we add an explicit deduplication. 

413 return tag_relation.chain(calib_relation).without_duplicates() 

414 else: 

415 return tag_relation 

416 elif calib_relation is not None: 

417 return calib_relation 

418 else: 

419 raise AssertionError("Branch should be unreachable.") 

420 

421 def _finish_single_relation( 

422 self, 

423 payload: sql.Payload[LogicalColumn], 

424 requested_columns: Set[str], 

425 collections: Sequence[tuple[CollectionRecord, int]], 

426 context: SqlQueryContext, 

427 ) -> Relation: 

428 """Handle adding columns and WHERE terms that are not specific to 

429 either the tags or calibs tables. 

430 

431 Helper method for `make_relation`. 

432 

433 Parameters 

434 ---------- 

435 payload : `lsst.daf.relation.sql.Payload` 

436 SQL query parts under construction, to be modified in-place and 

437 used to construct the new relation. 

438 requested_columns : `~collections.abc.Set` [ `str` ] 

439 Columns the relation should include. 

440 collections : `~collections.abc.Sequence` [ `tuple` \ 

441 [ `CollectionRecord`, `int` ] ] 

442 Collections to search for the dataset and their ranks. 

443 context : `SqlQueryContext` 

444 Context that manages engines and state for the query. 

445 

446 Returns 

447 ------- 

448 relation : `lsst.daf.relation.Relation` 

449 New dataset query relation. 

450 """ 

451 payload.where.append(payload.from_clause.columns.dataset_type_id == self._dataset_type_id) 

452 dataset_id_col = payload.from_clause.columns.dataset_id 

453 collection_col = payload.from_clause.columns[self._collections.getCollectionForeignKeyName()] 

454 # We always constrain and optionally retrieve the collection(s) via the 

455 # tags/calibs table. 

456 if len(collections) == 1: 

457 payload.where.append(collection_col == collections[0][0].key) 

458 if "collection" in requested_columns: 

459 payload.columns_available[ 

460 DatasetColumnTag(self.datasetType.name, "collection") 

461 ] = sqlalchemy.sql.literal(collections[0][0].key) 

462 else: 

463 assert collections, "The no-collections case should be in calling code for better diagnostics." 

464 payload.where.append(collection_col.in_([collection.key for collection, _ in collections])) 

465 if "collection" in requested_columns: 

466 payload.columns_available[ 

467 DatasetColumnTag(self.datasetType.name, "collection") 

468 ] = collection_col 

469 # Add rank, if requested, as a CASE-based calculation on the collection 

470 # column. 

471 if "rank" in requested_columns: 

472 payload.columns_available[DatasetColumnTag(self.datasetType.name, "rank")] = sqlalchemy.sql.case( 

473 {record.key: rank for record, rank in collections}, 

474 value=collection_col, 

475 ) 
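            # The expression above renders to SQL along the lines of
            #   CASE <collection column> WHEN 12 THEN 0 WHEN 47 THEN 1 END
            # (keys and ranks illustrative), i.e. the rank is the position of
            # the matched collection in the caller's search order.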

476 # Add more column definitions, starting with the data ID. 

477 for dimension_name in self.datasetType.dimensions.required.names: 

478 payload.columns_available[DimensionKeyColumnTag(dimension_name)] = payload.from_clause.columns[ 

479 dimension_name 

480 ] 

481 # We can always get the dataset_id from the tags/calibs table. 

482 if "dataset_id" in requested_columns: 

483 payload.columns_available[DatasetColumnTag(self.datasetType.name, "dataset_id")] = dataset_id_col 

484 # It's possible we now have everything we need, from just the 

485 # tags/calibs table. The things we might need to get from the static 

486 # dataset table are the run key and the ingest date. 

487 need_static_table = False 

488 if "run" in requested_columns: 

489 if len(collections) == 1 and collections[0][0].type is CollectionType.RUN: 

490 # If we are searching exactly one RUN collection, we 

491 # know that if we find the dataset in that collection, 

492 # then that's the dataset's run; we don't need to 

493 # query for it. 

494 payload.columns_available[ 

495 DatasetColumnTag(self.datasetType.name, "run") 

496 ] = sqlalchemy.sql.literal(collections[0][0].key) 

497 else: 

498 payload.columns_available[ 

499 DatasetColumnTag(self.datasetType.name, "run") 

500 ] = self._static.dataset.columns[self._runKeyColumn] 

501 need_static_table = True 

502 # Ingest date can only come from the static table. 

503 if "ingest_date" in requested_columns: 

504 need_static_table = True 

505 payload.columns_available[ 

506 DatasetColumnTag(self.datasetType.name, "ingest_date") 

507 ] = self._static.dataset.columns.ingest_date 

508 # If we need the static table, join it in via dataset_id and 

509 # dataset_type_id 

510 if need_static_table: 

511 payload.from_clause = payload.from_clause.join( 

512 self._static.dataset, onclause=(dataset_id_col == self._static.dataset.columns.id) 

513 ) 

514 # Also constrain dataset_type_id in static table in case that helps 

515 # generate a better plan. 

516 # We could also include this in the JOIN ON clause, but my guess is 

517 # that that's a good idea IFF it's in the foreign key, and right 

518 # now it isn't. 

519 payload.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id) 

520 leaf = context.sql_engine.make_leaf( 

521 payload.columns_available.keys(), 

522 payload=payload, 

523 name=self.datasetType.name, 

524 parameters={record.name: rank for record, rank in collections}, 

525 ) 

526 return leaf 

527 

528 def getDataId(self, id: DatasetId) -> DataCoordinate: 

529 """Return DataId for a dataset. 

530 

531 Parameters 

532 ---------- 

533 id : `DatasetId` 

534 Unique dataset identifier. 

535 

536 Returns 

537 ------- 

538 dataId : `DataCoordinate` 

539 DataId for the dataset. 

540 """ 

541 # This query could return multiple rows (one for each tagged collection 

542 # the dataset is in, plus one for its run collection), and we don't 

543 # care which of those we get. 

544 sql = ( 

545 self._tags.select() 

546 .where( 

547 sqlalchemy.sql.and_( 

548 self._tags.columns.dataset_id == id, 

549 self._tags.columns.dataset_type_id == self._dataset_type_id, 

550 ) 

551 ) 

552 .limit(1) 

553 ) 

554 with self._db.query(sql) as sql_result: 

555 row = sql_result.mappings().fetchone() 

556 assert row is not None, "Should be guaranteed by caller and foreign key constraints." 

557 return DataCoordinate.standardize( 

558 {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required}, 

559 graph=self.datasetType.dimensions, 

560 ) 

561 

562 

563class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage): 

564 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for 

565 dataset IDs. 

566 """ 

567 

568 idMaker = DatasetIdFactory() 

569 """Factory for dataset IDs. In the future this factory may be shared with 

570 other classes (e.g. Registry).""" 

571 

572 def insert( 

573 self, 

574 run: RunRecord, 

575 dataIds: Iterable[DataCoordinate], 

576 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

577 ) -> Iterator[DatasetRef]: 

578 # Docstring inherited from DatasetRecordStorage. 

579 

580 # Current timestamp, type depends on schema version. Use microsecond 

581 # precision for astropy time to keep things consistent with 

582 # TIMESTAMP(6) SQL type. 

583 timestamp: datetime | astropy.time.Time 

584 if self._use_astropy: 

585 # Astropy `now()` precision should be the same as `utcnow()` which 

586 # should mean microsecond. 

587 timestamp = astropy.time.Time.now() 

588 else: 

589 timestamp = datetime.utcnow() 

590 

591 # Iterate over data IDs, transforming a possibly-single-pass iterable 

592 # into a list. 

593 dataIdList = [] 

594 rows = [] 

595 summary = CollectionSummary() 

596 for dataId in summary.add_data_ids_generator(self.datasetType, dataIds): 

597 dataIdList.append(dataId) 

598 rows.append( 

599 { 

600 "id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode), 

601 "dataset_type_id": self._dataset_type_id, 

602 self._runKeyColumn: run.key, 

603 "ingest_date": timestamp, 

604 } 

605 ) 

606 

607 with self._db.transaction(): 

608 # Insert into the static dataset table. 

609 self._db.insert(self._static.dataset, *rows) 

610 # Update the summary tables for this collection in case this is the 

611 # first time this dataset type or these governor values will be 

612 # inserted there. 

613 self._summaries.update(run, [self._dataset_type_id], summary) 

614 # Combine the generated dataset_id values and data ID fields to 

615 # form rows to be inserted into the tags table. 

616 protoTagsRow = { 

617 "dataset_type_id": self._dataset_type_id, 

618 self._collections.getCollectionForeignKeyName(): run.key, 

619 } 

620 tagsRows = [ 

621 dict(protoTagsRow, dataset_id=row["id"], **dataId.byName()) 

622 for dataId, row in zip(dataIdList, rows, strict=True) 

623 ] 

624 # Insert those rows into the tags table. 

625 self._db.insert(self._tags, *tagsRows) 

626 

627 for dataId, row in zip(dataIdList, rows, strict=True): 

628 yield DatasetRef( 

629 datasetType=self.datasetType, 

630 dataId=dataId, 

631 id=row["id"], 

632 run=run.name, 

633 ) 

634 

635 def import_( 

636 self, 

637 run: RunRecord, 

638 datasets: Iterable[DatasetRef], 

639 ) -> Iterator[DatasetRef]: 

640 # Docstring inherited from DatasetRecordStorage. 

641 

642 # Current timestamp, type depends on schema version. 

643 if self._use_astropy: 

644 # Astropy `now()` precision should be the same as `utcnow()` which 

645 # should mean microsecond. 

646 timestamp = sqlalchemy.sql.literal(astropy.time.Time.now(), type_=ddl.AstropyTimeNsecTai) 

647 else: 

648 timestamp = sqlalchemy.sql.literal(datetime.utcnow()) 

649 

650 # Iterate over the datasets, transforming a possibly-single-pass iterable 

651 # into a mapping from dataset ID to data ID. 

652 dataIds = {} 

653 summary = CollectionSummary() 

654 for dataset in summary.add_datasets_generator(datasets): 

655 dataIds[dataset.id] = dataset.dataId 

656 

657 # We'll insert all new rows into a temporary table 

658 tableSpec = makeTagTableSpec(self.datasetType, type(self._collections), ddl.GUID, constraints=False) 

659 collFkName = self._collections.getCollectionForeignKeyName() 

660 protoTagsRow = { 

661 "dataset_type_id": self._dataset_type_id, 

662 collFkName: run.key, 

663 } 

664 tmpRows = [ 

665 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

666 for dataset_id, dataId in dataIds.items() 

667 ] 

668 with self._db.transaction(for_temp_tables=True), self._db.temporary_table(tableSpec) as tmp_tags: 

669 # store all incoming data in a temporary table 

670 self._db.insert(tmp_tags, *tmpRows) 

671 

672 # There are some checks that we want to make for consistency 

673 # of the new datasets with existing ones. 

674 self._validateImport(tmp_tags, run) 

675 

676 # Before we merge temporary table into dataset/tags we need to 

677 # drop datasets which are already there (and do not conflict). 

678 self._db.deleteWhere( 

679 tmp_tags, 

680 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)), 

681 ) 

682 

683 # Copy it into dataset table, need to re-label some columns. 

684 self._db.insert( 

685 self._static.dataset, 

686 select=sqlalchemy.sql.select( 

687 tmp_tags.columns.dataset_id.label("id"), 

688 tmp_tags.columns.dataset_type_id, 

689 tmp_tags.columns[collFkName].label(self._runKeyColumn), 

690 timestamp.label("ingest_date"), 

691 ), 

692 ) 

693 

694 # Update the summary tables for this collection in case this 

695 # is the first time this dataset type or these governor values 

696 # will be inserted there. 

697 self._summaries.update(run, [self._dataset_type_id], summary) 

698 

699 # Copy it into tags table. 

700 self._db.insert(self._tags, select=tmp_tags.select()) 

701 

702 # Return refs in the same order as in the input list. 

703 for dataset_id, dataId in dataIds.items(): 

704 yield DatasetRef( 

705 datasetType=self.datasetType, 

706 id=dataset_id, 

707 dataId=dataId, 

708 run=run.name, 

709 ) 

710 
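    # The temporary-table merge in import_ boils down to the following sketch
    # (simplified table and column names, not the exact statements issued):
    #
    #   INSERT INTO dataset (id, dataset_type_id, <run column>, ingest_date)
    #       SELECT dataset_id, dataset_type_id, <collection column>, <timestamp>
    #       FROM tmp_tags;
    #   INSERT INTO <datasetType>_tags SELECT * FROM tmp_tags;
    #
    # after rows whose dataset_id already exists in the dataset table have
    # been removed from tmp_tags and the remaining rows have passed
    # _validateImport.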

711 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None: 

712 """Validate imported refs against existing datasets. 

713 

714 Parameters 

715 ---------- 

716 tmp_tags : `sqlalchemy.schema.Table` 

717 Temporary table with new datasets and the same schema as the tags 

718 table. 

719 run : `RunRecord` 

720 The record object describing the `~CollectionType.RUN` collection. 

721 

722 Raises 

723 ------ 

724 ConflictingDefinitionError 

725 Raised if new datasets conflict with existing ones. 

726 """ 

727 dataset = self._static.dataset 

728 tags = self._tags 

729 collFkName = self._collections.getCollectionForeignKeyName() 

730 

731 # Check that existing datasets have the same dataset type and 

732 # run. 

733 query = ( 

734 sqlalchemy.sql.select( 

735 dataset.columns.id.label("dataset_id"), 

736 dataset.columns.dataset_type_id.label("dataset_type_id"), 

737 tmp_tags.columns.dataset_type_id.label("new_dataset_type_id"), 

738 dataset.columns[self._runKeyColumn].label("run"), 

739 tmp_tags.columns[collFkName].label("new_run"), 

740 ) 

741 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id)) 

742 .where( 

743 sqlalchemy.sql.or_( 

744 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

745 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName], 

746 ) 

747 ) 

748 .limit(1) 

749 ) 

750 with self._db.query(query) as result: 

751 # Only include the first one in the exception message 

752 if (row := result.first()) is not None: 

753 existing_run = self._collections[row.run].name 

754 new_run = self._collections[row.new_run].name 

755 if row.dataset_type_id == self._dataset_type_id: 

756 if row.new_dataset_type_id == self._dataset_type_id:    [756 ↛ 762: the condition on line 756 was never false]

757 raise ConflictingDefinitionError( 

758 f"Current run {existing_run!r} and new run {new_run!r} do not agree for " 

759 f"dataset {row.dataset_id}." 

760 ) 

761 else: 

762 raise ConflictingDefinitionError( 

763 f"Dataset {row.dataset_id} was provided with type {self.datasetType.name!r} " 

764 f"in run {new_run!r}, but was already defined with type ID {row.dataset_type_id} " 

765 f"in run {run!r}." 

766 ) 

767 else: 

768 raise ConflictingDefinitionError( 

769 f"Dataset {row.dataset_id} was provided with type ID {row.new_dataset_type_id} " 

770 f"in run {new_run!r}, but was already defined with type {self.datasetType.name!r} " 

771 f"in run {run!r}." 

772 ) 

773 

774 # Check that matching dataset in tags table has the same DataId. 

775 query = ( 

776 sqlalchemy.sql.select( 

777 tags.columns.dataset_id, 

778 tags.columns.dataset_type_id.label("type_id"), 

779 tmp_tags.columns.dataset_type_id.label("new_type_id"), 

780 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

781 *[ 

782 tmp_tags.columns[dim].label(f"new_{dim}") 

783 for dim in self.datasetType.dimensions.required.names 

784 ], 

785 ) 

786 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id)) 

787 .where( 

788 sqlalchemy.sql.or_( 

789 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

790 *[ 

791 tags.columns[dim] != tmp_tags.columns[dim] 

792 for dim in self.datasetType.dimensions.required.names 

793 ], 

794 ) 

795 ) 

796 .limit(1) 

797 ) 

798 

799 with self._db.query(query) as result: 

800 if (row := result.first()) is not None: 

801 # Only include the first one in the exception message 

802 raise ConflictingDefinitionError( 

803 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}" 

804 ) 

805 

806 # Check that matching run+dataId have the same dataset ID. 

807 query = ( 

808 sqlalchemy.sql.select( 

809 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

810 tags.columns.dataset_id, 

811 tmp_tags.columns.dataset_id.label("new_dataset_id"), 

812 tags.columns[collFkName], 

813 tmp_tags.columns[collFkName].label(f"new_{collFkName}"), 

814 ) 

815 .select_from( 

816 tags.join( 

817 tmp_tags, 

818 sqlalchemy.sql.and_( 

819 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id, 

820 tags.columns[collFkName] == tmp_tags.columns[collFkName], 

821 *[ 

822 tags.columns[dim] == tmp_tags.columns[dim] 

823 for dim in self.datasetType.dimensions.required.names 

824 ], 

825 ), 

826 ) 

827 ) 

828 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id) 

829 .limit(1) 

830 ) 

831 with self._db.query(query) as result: 

832 # only include the first one in the exception message 

833 if (row := result.first()) is not None: 

834 data_id = {dim: getattr(row, dim) for dim in self.datasetType.dimensions.required.names} 

835 existing_collection = self._collections[getattr(row, collFkName)].name 

836 new_collection = self._collections[getattr(row, f"new_{collFkName}")].name 

837 raise ConflictingDefinitionError( 

838 f"Dataset with type {self.datasetType.name!r} and data ID {data_id} " 

839 f"has ID {row.dataset_id} in existing collection {existing_collection!r} " 

840 f"but ID {row.new_dataset_id} in new collection {new_collection!r}." 

841 )