Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 95%

241 statements  

coverage.py v7.2.3, created at 2023-04-19 03:42 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23from __future__ import annotations 

24 

25__all__ = ("ByDimensionsDatasetRecordStorage",) 

26 

27import uuid 

28from collections.abc import Iterable, Iterator, Sequence, Set 

29from datetime import datetime 

30from typing import TYPE_CHECKING 

31 

32import astropy.time 

33import sqlalchemy 

34from lsst.daf.relation import Relation, sql 

35 

36from ....core import ( 

37 DataCoordinate, 

38 DatasetColumnTag, 

39 DatasetId, 

40 DatasetRef, 

41 DatasetType, 

42 DimensionKeyColumnTag, 

43 LogicalColumn, 

44 Timespan, 

45 ddl, 

46) 

47from ..._collection_summary import CollectionSummary 

48from ..._collectionType import CollectionType 

49from ..._exceptions import CollectionTypeError, ConflictingDefinitionError 

50from ...interfaces import DatasetIdFactory, DatasetIdGenEnum, DatasetRecordStorage 

51from ...queries import SqlQueryContext 

52from .tables import makeTagTableSpec 

53 

54if TYPE_CHECKING: 

55 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord 

56 from .summaries import CollectionSummaryManager 

57 from .tables import StaticDatasetTablesTuple 

58 

59 

60class ByDimensionsDatasetRecordStorage(DatasetRecordStorage): 

61 """Dataset record storage implementation paired with 

62 `ByDimensionsDatasetRecordStorageManagerUUID`; see that class for more 

63 information. 

64 

65 Instances of this class should never be constructed directly; use 

66 `DatasetRecordStorageManager.register` instead. 

67 """ 

68 

69 def __init__( 

70 self, 

71 *, 

72 datasetType: DatasetType, 

73 db: Database, 

74 dataset_type_id: int, 

75 collections: CollectionManager, 

76 static: StaticDatasetTablesTuple, 

77 summaries: CollectionSummaryManager, 

78 tags: sqlalchemy.schema.Table, 

79 use_astropy_ingest_date: bool, 

80 calibs: sqlalchemy.schema.Table | None, 

81 ): 

82 super().__init__(datasetType=datasetType) 

83 self._dataset_type_id = dataset_type_id 

84 self._db = db 

85 self._collections = collections 

86 self._static = static 

87 self._summaries = summaries 

88 self._tags = tags 

89 self._calibs = calibs 

90 self._runKeyColumn = collections.getRunForeignKeyName() 

91 self._use_astropy = use_astropy_ingest_date 

92 

93 def delete(self, datasets: Iterable[DatasetRef]) -> None: 

94 # Docstring inherited from DatasetRecordStorage. 

95 # Only delete from common dataset table; ON DELETE foreign key clauses 

96 # will handle the rest. 

97 self._db.delete( 

98 self._static.dataset, 

99 ["id"], 

100 *[{"id": dataset.getCheckedId()} for dataset in datasets], 

101 ) 

102 

103 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

104 # Docstring inherited from DatasetRecordStorage. 

105 if collection.type is not CollectionType.TAGGED:    105 ↛ 106: line 105 didn't jump to line 106 because the condition on line 105 was never true

106 raise TypeError( 

107 f"Cannot associate into collection '{collection.name}' " 

108 f"of type {collection.type.name}; must be TAGGED." 

109 ) 

110 protoRow = { 

111 self._collections.getCollectionForeignKeyName(): collection.key, 

112 "dataset_type_id": self._dataset_type_id, 

113 } 

114 rows = [] 

115 summary = CollectionSummary() 

116 for dataset in summary.add_datasets_generator(datasets): 

117 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

118 for dimension, value in dataset.dataId.items(): 

119 row[dimension.name] = value 

120 rows.append(row) 

121 # Update the summary tables for this collection in case this is the 

122 # first time this dataset type or these governor values will be 

123 # inserted there. 

124 self._summaries.update(collection, [self._dataset_type_id], summary) 

125 # Update the tag table itself. 

126 self._db.replace(self._tags, *rows) 

127 

128 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

129 # Docstring inherited from DatasetRecordStorage. 

130 if collection.type is not CollectionType.TAGGED:    130 ↛ 131: line 130 didn't jump to line 131 because the condition on line 130 was never true

131 raise TypeError( 

132 f"Cannot disassociate from collection '{collection.name}' " 

133 f"of type {collection.type.name}; must be TAGGED." 

134 ) 

135 rows = [ 

136 { 

137 "dataset_id": dataset.getCheckedId(), 

138 self._collections.getCollectionForeignKeyName(): collection.key, 

139 } 

140 for dataset in datasets 

141 ] 

142 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows) 

143 

144 def _buildCalibOverlapQuery( 

145 self, 

146 collection: CollectionRecord, 

147 data_ids: set[DataCoordinate] | None, 

148 timespan: Timespan, 

149 context: SqlQueryContext, 

150 ) -> Relation: 

151 relation = self.make_relation( 

152 collection, columns={"timespan", "dataset_id", "calib_pkey"}, context=context 

153 ).with_rows_satisfying( 

154 context.make_timespan_overlap_predicate( 

155 DatasetColumnTag(self.datasetType.name, "timespan"), timespan 

156 ), 

157 ) 

158 if data_ids is not None: 

159 relation = relation.join( 

160 context.make_data_id_relation( 

161 data_ids, self.datasetType.dimensions.required.names 

162 ).transferred_to(context.sql_engine), 

163 ) 

164 return relation 

165 

166 def certify( 

167 self, 

168 collection: CollectionRecord, 

169 datasets: Iterable[DatasetRef], 

170 timespan: Timespan, 

171 context: SqlQueryContext, 

172 ) -> None: 

173 # Docstring inherited from DatasetRecordStorage. 

174 if self._calibs is None:    174 ↛ 175: line 174 didn't jump to line 175 because the condition on line 174 was never true

175 raise CollectionTypeError( 

176 f"Cannot certify datasets of type {self.datasetType.name}, for which " 

177 "DatasetType.isCalibration() is False." 

178 ) 

179 if collection.type is not CollectionType.CALIBRATION:    179 ↛ 180: line 179 didn't jump to line 180 because the condition on line 179 was never true

180 raise CollectionTypeError( 

181 f"Cannot certify into collection '{collection.name}' " 

182 f"of type {collection.type.name}; must be CALIBRATION." 

183 ) 

184 TimespanReprClass = self._db.getTimespanRepresentation() 

185 protoRow = { 

186 self._collections.getCollectionForeignKeyName(): collection.key, 

187 "dataset_type_id": self._dataset_type_id, 

188 } 

189 rows = [] 

190 dataIds: set[DataCoordinate] | None = ( 

191 set() if not TimespanReprClass.hasExclusionConstraint() else None 

192 ) 

193 summary = CollectionSummary() 

194 for dataset in summary.add_datasets_generator(datasets): 

195 row = dict(protoRow, dataset_id=dataset.getCheckedId()) 

196 for dimension, value in dataset.dataId.items(): 

197 row[dimension.name] = value 

198 TimespanReprClass.update(timespan, result=row) 

199 rows.append(row) 

200 if dataIds is not None:    200 ↛ 194: line 200 didn't jump to line 194 because the condition on line 200 was never false

201 dataIds.add(dataset.dataId) 

202 # Update the summary tables for this collection in case this is the 

203 # first time this dataset type or these governor values will be 

204 # inserted there. 

205 self._summaries.update(collection, [self._dataset_type_id], summary) 

206 # Update the association table itself. 

207 if TimespanReprClass.hasExclusionConstraint():    207 ↛ 210: line 207 didn't jump to line 210 because the condition on line 207 was never true

208 # Rely on database constraint to enforce invariants; we just 

209 # reraise the exception for consistency across DB engines. 

210 try: 

211 self._db.insert(self._calibs, *rows) 

212 except sqlalchemy.exc.IntegrityError as err: 

213 raise ConflictingDefinitionError( 

214 f"Validity range conflict certifying datasets of type {self.datasetType.name} " 

215 f"into {collection.name} for range [{timespan.begin}, {timespan.end})." 

216 ) from err 

217 else: 

218 # Have to implement exclusion constraint ourselves. 

219 # Start by building a SELECT query for any rows that would overlap 

220 # this one. 

221 relation = self._buildCalibOverlapQuery(collection, dataIds, timespan, context) 

222 # Acquire a table lock to ensure there are no concurrent writes 

223 # that could invalidate our checking before we finish the inserts. We 

224 # use a SAVEPOINT in case there is an outer transaction that a 

225 # failure here should not roll back. 

226 with self._db.transaction(lock=[self._calibs], savepoint=True): 

227 # Enter SqlQueryContext in case we need to use a temporary 

228 # table to include the given data IDs in the query. Note that 

229 # by doing this inside the transaction, we make sure it doesn't 

230 # attempt to close the session when it's done, since it just 

231 # sees an already-open session that it knows it shouldn't 

232 # manage. 

233 with context: 

234 # Run the check SELECT query. 

235 conflicting = context.count(context.process(relation)) 

236 if conflicting > 0: 

237 raise ConflictingDefinitionError( 

238 f"{conflicting} validity range conflicts certifying datasets of type " 

239 f"{self.datasetType.name} into {collection.name} for range " 

240 f"[{timespan.begin}, {timespan.end})." 

241 ) 

242 # Proceed with the insert. 

243 self._db.insert(self._calibs, *rows) 

244 
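# --- Editor's illustrative sketch (not part of the original source) ---
# A hedged sketch of the certify/decertify flow for calibration datasets,
# assuming ``storage`` wraps a calibration dataset type, ``calib_record`` is a
# CALIBRATION ``CollectionRecord``, ``ctx`` is a ``SqlQueryContext``, and
# ``t0``/``t1``/``t2`` are ``astropy.time.Time`` values (assumed names):
#
#     storage.certify(calib_record, refs, Timespan(t0, t2), context=ctx)
#     # Overlapping validity ranges for the same data IDs raise
#     # ConflictingDefinitionError, either via the database exclusion
#     # constraint or the explicit SELECT-under-lock check above:
#     storage.certify(calib_record, refs, Timespan(t1, t2), context=ctx)
#     # decertify (below) truncates or splits the existing validity ranges:
#     storage.decertify(calib_record, Timespan(t1, t2), context=ctx)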

245 def decertify( 

246 self, 

247 collection: CollectionRecord, 

248 timespan: Timespan, 

249 *, 

250 dataIds: Iterable[DataCoordinate] | None = None, 

251 context: SqlQueryContext, 

252 ) -> None: 

253 # Docstring inherited from DatasetRecordStorage. 

254 if self._calibs is None:    254 ↛ 255: line 254 didn't jump to line 255 because the condition on line 254 was never true

255 raise CollectionTypeError( 

256 f"Cannot decertify datasets of type {self.datasetType.name}, for which " 

257 "DatasetType.isCalibration() is False." 

258 ) 

259 if collection.type is not CollectionType.CALIBRATION:    259 ↛ 260: line 259 didn't jump to line 260 because the condition on line 259 was never true

260 raise CollectionTypeError( 

261 f"Cannot decertify from collection '{collection.name}' " 

262 f"of type {collection.type.name}; must be CALIBRATION." 

263 ) 

264 TimespanReprClass = self._db.getTimespanRepresentation() 

265 # Construct a SELECT query to find all rows that overlap our inputs. 

266 dataIdSet: set[DataCoordinate] | None 

267 if dataIds is not None: 

268 dataIdSet = set(dataIds) 

269 else: 

270 dataIdSet = None 

271 relation = self._buildCalibOverlapQuery(collection, dataIdSet, timespan, context) 

272 calib_pkey_tag = DatasetColumnTag(self.datasetType.name, "calib_pkey") 

273 dataset_id_tag = DatasetColumnTag(self.datasetType.name, "dataset_id") 

274 timespan_tag = DatasetColumnTag(self.datasetType.name, "timespan") 

275 data_id_tags = [ 

276 (name, DimensionKeyColumnTag(name)) for name in self.datasetType.dimensions.required.names 

277 ] 

278 # Set up collections to populate with the rows we'll want to modify. 

279 # The insert rows will have the same values for collection and 

280 # dataset type. 

281 protoInsertRow = { 

282 self._collections.getCollectionForeignKeyName(): collection.key, 

283 "dataset_type_id": self._dataset_type_id, 

284 } 

285 rowsToDelete = [] 

286 rowsToInsert = [] 

287 # Acquire a table lock to ensure there are no concurrent writes 

288 # between the SELECT and the DELETE and INSERT queries based on it. 

289 with self._db.transaction(lock=[self._calibs], savepoint=True): 

290 # Enter SqlQueryContext in case we need to use a temporary table to 

291 # include the given data IDs in the query (see similar block in 

292 # certify for details). 

293 with context: 

294 for row in context.fetch_iterable(relation): 

295 rowsToDelete.append({"id": row[calib_pkey_tag]}) 

296 # Construct the insert row(s) by copying the prototype row, 

297 # then adding the dimension column values, then adding 

298 # what's left of the timespan from that row after we 

299 # subtract the given timespan. 

300 newInsertRow = protoInsertRow.copy() 

301 newInsertRow["dataset_id"] = row[dataset_id_tag] 

302 for name, tag in data_id_tags: 

303 newInsertRow[name] = row[tag] 

304 rowTimespan = row[timespan_tag] 

305 assert rowTimespan is not None, "Field should have a NOT NULL constraint." 

306 for diffTimespan in rowTimespan.difference(timespan): 

307 rowsToInsert.append( 

308 TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()) 

309 ) 

310 # Run the DELETE and INSERT queries. 

311 self._db.delete(self._calibs, ["id"], *rowsToDelete) 

312 self._db.insert(self._calibs, *rowsToInsert) 

313 

314 def make_relation( 

315 self, 

316 *collections: CollectionRecord, 

317 columns: Set[str], 

318 context: SqlQueryContext, 

319 ) -> Relation: 

320 # Docstring inherited from DatasetRecordStorage. 

321 collection_types = {collection.type for collection in collections} 

322 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened." 

323 TimespanReprClass = self._db.getTimespanRepresentation() 

324 # 

325 # There are two kinds of table in play here: 

326 # 

327 # - the static dataset table (with the dataset ID, dataset type ID, 

328 # run ID/name, and ingest date); 

329 # 

330 # - the dynamic tags/calibs table (with the dataset ID, dataset type 

331 # ID, collection ID/name, data ID, and possibly validity 

332 # range). 

333 # 

334 # That means that we might want to return a query against either table 

335 # or a JOIN of both, depending on which quantities the caller wants. 

336 # But the data ID is always included, which means we'll always include 

337 # the tags/calibs table and join in the static dataset table only if we 

338 # need things from it that we can't get from the tags/calibs table. 

339 # 

340 # Note that it's important that we include a WHERE constraint on both 

341 # tables for any column (e.g. dataset_type_id) that is in both when 

342 # it's given explicitly; not doing so can prevent the query planner from 

343 # using very important indexes. At present, we don't include those 

344 # redundant columns in the JOIN ON expression, however, because the 

345 # FOREIGN KEY (and its index) are defined only on dataset_id. 

346 tag_relation: Relation | None = None 

347 calib_relation: Relation | None = None 

348 if collection_types != {CollectionType.CALIBRATION}: 

349 # We'll need a subquery for the tags table if any of the given 

350 # collections are not a CALIBRATION collection. This intentionally 

351 # also fires when the list of collections is empty as a way to 

352 # create a dummy subquery that we know will fail. 

353 # We give the table an alias because it might appear multiple times 

354 # in the same query, for different dataset types. 

355 tags_parts = sql.Payload[LogicalColumn](self._tags.alias(f"{self.datasetType.name}_tags")) 

356 if "timespan" in columns: 

357 tags_parts.columns_available[ 

358 DatasetColumnTag(self.datasetType.name, "timespan") 

359 ] = TimespanReprClass.fromLiteral(Timespan(None, None)) 

360 tag_relation = self._finish_single_relation( 

361 tags_parts, 

362 columns, 

363 [ 

364 (record, rank) 

365 for rank, record in enumerate(collections) 

366 if record.type is not CollectionType.CALIBRATION 

367 ], 

368 context, 

369 ) 

370 assert "calib_pkey" not in columns, "For internal use only, and only for pure-calib queries." 

371 if CollectionType.CALIBRATION in collection_types: 

372 # If at least one collection is a CALIBRATION collection, we'll 

373 # need a subquery for the calibs table, and could include the 

374 # timespan as a result or constraint. 

375 assert ( 

376 self._calibs is not None 

377 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection." 

378 calibs_parts = sql.Payload[LogicalColumn](self._calibs.alias(f"{self.datasetType.name}_calibs")) 

379 if "timespan" in columns: 

380 calibs_parts.columns_available[ 

381 DatasetColumnTag(self.datasetType.name, "timespan") 

382 ] = TimespanReprClass.from_columns(calibs_parts.from_clause.columns) 

383 if "calib_pkey" in columns: 

384 # This is a private extension not included in the base class 

385 # interface, for internal use only in _buildCalibOverlapQuery, 

386 # which needs access to the autoincrement primary key for the 

387 # calib association table. 

388 calibs_parts.columns_available[ 

389 DatasetColumnTag(self.datasetType.name, "calib_pkey") 

390 ] = calibs_parts.from_clause.columns.id 

391 calib_relation = self._finish_single_relation( 

392 calibs_parts, 

393 columns, 

394 [ 

395 (record, rank) 

396 for rank, record in enumerate(collections) 

397 if record.type is CollectionType.CALIBRATION 

398 ], 

399 context, 

400 ) 

401 if tag_relation is not None: 

402 if calib_relation is not None: 

403 # daf_relation's chain operation does not automatically 

404 # deduplicate; it's more like SQL's UNION ALL. To get UNION 

405 # in SQL here, we add an explicit deduplication. 

406 return tag_relation.chain(calib_relation).without_duplicates() 

407 else: 

408 return tag_relation 

409 elif calib_relation is not None: 

410 return calib_relation 

411 else: 

412 raise AssertionError("Branch should be unreachable.") 

413 
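# --- Editor's illustrative sketch (not part of the original source) ---
# A hedged sketch of a `make_relation` call, assuming ``run_record`` and
# ``calib_record`` are already-flattened (non-CHAINED) ``CollectionRecord``
# objects and ``ctx`` is a ``SqlQueryContext`` (assumed names).  Requesting
# "run" or "ingest_date" is what forces the JOIN against the static dataset
# table described in the comments above:
#
#     relation = storage.make_relation(
#         run_record,
#         calib_record,
#         columns={"dataset_id", "run", "timespan"},
#         context=ctx,
#     )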

414 def _finish_single_relation( 

415 self, 

416 payload: sql.Payload[LogicalColumn], 

417 requested_columns: Set[str], 

418 collections: Sequence[tuple[CollectionRecord, int]], 

419 context: SqlQueryContext, 

420 ) -> Relation: 

421 """Helper method for `make_relation`. 

422 

423 This handles adding columns and WHERE terms that are not specific to 

424 either the tags or calibs tables. 

425 

426 Parameters 

427 ---------- 

428 payload : `lsst.daf.relation.sql.Payload` 

429 SQL query parts under construction, to be modified in-place and 

430 used to construct the new relation. 

431 requested_columns : `~collections.abc.Set` [ `str` ] 

432 Columns the relation should include. 

433 collections : `Sequence` [ `tuple` [ `CollectionRecord`, `int` ] ] 

434 Collections to search for the dataset and their ranks. 

435 context : `SqlQueryContext` 

436 Context that manages engines and state for the query. 

437 

438 Returns 

439 ------- 

440 relation : `lsst.daf.relation.Relation` 

441 New dataset query relation. 

442 """ 

443 payload.where.append(payload.from_clause.columns.dataset_type_id == self._dataset_type_id) 

444 dataset_id_col = payload.from_clause.columns.dataset_id 

445 collection_col = payload.from_clause.columns[self._collections.getCollectionForeignKeyName()] 

446 # We always constrain and optionally retrieve the collection(s) via the 

447 # tags/calibs table. 

448 if len(collections) == 1: 

449 payload.where.append(collection_col == collections[0][0].key) 

450 if "collection" in requested_columns: 

451 payload.columns_available[ 

452 DatasetColumnTag(self.datasetType.name, "collection") 

453 ] = sqlalchemy.sql.literal(collections[0][0].key) 

454 else: 

455 assert collections, "The no-collections case should be handled in calling code for better diagnostics." 

456 payload.where.append(collection_col.in_([collection.key for collection, _ in collections])) 

457 if "collection" in requested_columns: 

458 payload.columns_available[ 

459 DatasetColumnTag(self.datasetType.name, "collection") 

460 ] = collection_col 

461 # Add rank, if requested, as a CASE-based calculation on the collection 

462 # column. 

463 if "rank" in requested_columns: 

464 payload.columns_available[DatasetColumnTag(self.datasetType.name, "rank")] = sqlalchemy.sql.case( 

465 {record.key: rank for record, rank in collections}, 

466 value=collection_col, 

467 ) 

468 # Add more column definitions, starting with the data ID. 

469 for dimension_name in self.datasetType.dimensions.required.names: 

470 payload.columns_available[DimensionKeyColumnTag(dimension_name)] = payload.from_clause.columns[ 

471 dimension_name 

472 ] 

473 # We can always get the dataset_id from the tags/calibs table. 

474 if "dataset_id" in requested_columns: 

475 payload.columns_available[DatasetColumnTag(self.datasetType.name, "dataset_id")] = dataset_id_col 

476 # It's possible we now have everything we need, from just the 

477 # tags/calibs table. The things we might need to get from the static 

478 # dataset table are the run key and the ingest date. 

479 need_static_table = False 

480 if "run" in requested_columns: 

481 if len(collections) == 1 and collections[0][0].type is CollectionType.RUN: 

482 # If we are searching exactly one RUN collection, we 

483 # know that if we find the dataset in that collection, 

484 # then that's the dataset's run; we don't need to 

485 # query for it. 

486 payload.columns_available[ 

487 DatasetColumnTag(self.datasetType.name, "run") 

488 ] = sqlalchemy.sql.literal(collections[0][0].key) 

489 else: 

490 payload.columns_available[ 

491 DatasetColumnTag(self.datasetType.name, "run") 

492 ] = self._static.dataset.columns[self._runKeyColumn] 

493 need_static_table = True 

494 # Ingest date can only come from the static table. 

495 if "ingest_date" in requested_columns: 

496 need_static_table = True 

497 payload.columns_available[ 

498 DatasetColumnTag(self.datasetType.name, "ingest_date") 

499 ] = self._static.dataset.columns.ingest_date 

500 # If we need the static table, join it in via dataset_id and 

501 # dataset_type_id 

502 if need_static_table: 

503 payload.from_clause = payload.from_clause.join( 

504 self._static.dataset, onclause=(dataset_id_col == self._static.dataset.columns.id) 

505 ) 

506 # Also constrain dataset_type_id in static table in case that helps 

507 # generate a better plan. 

508 # We could also include this in the JOIN ON clause, but my guess is 

509 # that that's a good idea IFF it's in the foreign key, and right 

510 # now it isn't. 

511 payload.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id) 

512 leaf = context.sql_engine.make_leaf( 

513 payload.columns_available.keys(), 

514 payload=payload, 

515 name=self.datasetType.name, 

516 parameters={record.name: rank for record, rank in collections}, 

517 ) 

518 return leaf 

519 

520 def getDataId(self, id: DatasetId) -> DataCoordinate: 

521 """Return DataId for a dataset. 

522 

523 Parameters 

524 ---------- 

525 id : `DatasetId` 

526 Unique dataset identifier. 

527 

528 Returns 

529 ------- 

530 dataId : `DataCoordinate` 

531 DataId for the dataset. 

532 """ 

533 # This query could return multiple rows (one for each tagged collection 

534 # the dataset is in, plus one for its run collection), and we don't 

535 # care which of those we get. 

536 sql = ( 

537 self._tags.select() 

538 .where( 

539 sqlalchemy.sql.and_( 

540 self._tags.columns.dataset_id == id, 

541 self._tags.columns.dataset_type_id == self._dataset_type_id, 

542 ) 

543 ) 

544 .limit(1) 

545 ) 

546 with self._db.query(sql) as sql_result: 

547 row = sql_result.mappings().fetchone() 

548 assert row is not None, "Should be guaranteed by caller and foreign key constraints." 

549 return DataCoordinate.standardize( 

550 {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required}, 

551 graph=self.datasetType.dimensions, 

552 ) 

553 

554 

555class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage): 

556 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for 

557 dataset IDs. 

558 """ 

559 

560 idMaker = DatasetIdFactory() 

561 """Factory for dataset IDs. In the future this factory may be shared with 

562 other classes (e.g. Registry).""" 

563 

564 def insert( 

565 self, 

566 run: RunRecord, 

567 dataIds: Iterable[DataCoordinate], 

568 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

569 ) -> Iterator[DatasetRef]: 

570 # Docstring inherited from DatasetRecordStorage. 

571 

572 # Current timestamp; its type depends on the schema version. Use microsecond 

573 # precision for astropy time to keep things consistent with 

574 # TIMESTAMP(6) SQL type. 

575 timestamp: datetime | astropy.time.Time 

576 if self._use_astropy: 

577 # Astropy `now()` precision should be the same as `utcnow()` which 

578 # should mean microsecond. 

579 timestamp = astropy.time.Time.now() 

580 else: 

581 timestamp = datetime.utcnow() 

582 

583 # Iterate over data IDs, transforming a possibly-single-pass iterable 

584 # into a list. 

585 dataIdList = [] 

586 rows = [] 

587 summary = CollectionSummary() 

588 for dataId in summary.add_data_ids_generator(self.datasetType, dataIds): 

589 dataIdList.append(dataId) 

590 rows.append( 

591 { 

592 "id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode), 

593 "dataset_type_id": self._dataset_type_id, 

594 self._runKeyColumn: run.key, 

595 "ingest_date": timestamp, 

596 } 

597 ) 

598 

599 with self._db.transaction(): 

600 # Insert into the static dataset table. 

601 self._db.insert(self._static.dataset, *rows) 

602 # Update the summary tables for this collection in case this is the 

603 # first time this dataset type or these governor values will be 

604 # inserted there. 

605 self._summaries.update(run, [self._dataset_type_id], summary) 

606 # Combine the generated dataset_id values and data ID fields to 

607 # form rows to be inserted into the tags table. 

608 protoTagsRow = { 

609 "dataset_type_id": self._dataset_type_id, 

610 self._collections.getCollectionForeignKeyName(): run.key, 

611 } 

612 tagsRows = [ 

613 dict(protoTagsRow, dataset_id=row["id"], **dataId.byName()) 

614 for dataId, row in zip(dataIdList, rows) 

615 ] 

616 # Insert those rows into the tags table. 

617 self._db.insert(self._tags, *tagsRows) 

618 

619 for dataId, row in zip(dataIdList, rows): 

620 yield DatasetRef( 

621 datasetType=self.datasetType, 

622 dataId=dataId, 

623 id=row["id"], 

624 run=run.name, 

625 ) 

626 
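# --- Editor's illustrative sketch (not part of the original source) ---
# A hedged sketch contrasting `insert` above with `import_` below, assuming
# ``run_record`` is a ``RunRecord`` and ``external_refs`` is an iterable of
# ``DatasetRef`` that may already carry UUIDs (assumed names):
#
#     # New datasets: IDs are minted here via ``idMaker``.
#     new_refs = list(storage.insert(run_record, data_ids))
#     # Pre-existing datasets (e.g. from an export): UUIDs are reused when
#     # present, and a temporary tags table is used to validate consistency
#     # against what is already in the registry.
#     imported = list(storage.import_(run_record, external_refs))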

627 def import_( 

628 self, 

629 run: RunRecord, 

630 datasets: Iterable[DatasetRef], 

631 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

632 reuseIds: bool = False, 

633 ) -> Iterator[DatasetRef]: 

634 # Docstring inherited from DatasetRecordStorage. 

635 

636 # Current timestamp; its type depends on the schema version. 

637 if self._use_astropy: 

638 # Astropy `now()` precision should be the same as `utcnow()` which 

639 # should mean microsecond. 

640 timestamp = sqlalchemy.sql.literal(astropy.time.Time.now(), type_=ddl.AstropyTimeNsecTai) 

641 else: 

642 timestamp = sqlalchemy.sql.literal(datetime.utcnow()) 

643 

644 # Iterate over data IDs, transforming a possibly-single-pass iterable 

645 # into a list. 

646 dataIds = {} 

647 summary = CollectionSummary() 

648 for dataset in summary.add_datasets_generator(datasets): 

649 # Ignore unknown ID types; normally all IDs have the same type, but 

650 # this code supports mixed types or missing IDs. 

651 datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None 

652 if datasetId is None: 

653 datasetId = self.idMaker.makeDatasetId( 

654 run.name, self.datasetType, dataset.dataId, idGenerationMode 

655 ) 

656 dataIds[datasetId] = dataset.dataId 

657 

658 # We'll insert all new rows into a temporary table 

659 tableSpec = makeTagTableSpec(self.datasetType, type(self._collections), ddl.GUID, constraints=False) 

660 collFkName = self._collections.getCollectionForeignKeyName() 

661 protoTagsRow = { 

662 "dataset_type_id": self._dataset_type_id, 

663 collFkName: run.key, 

664 } 

665 tmpRows = [ 

666 dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName()) 

667 for dataset_id, dataId in dataIds.items() 

668 ] 

669 with self._db.transaction(for_temp_tables=True): 

670 with self._db.temporary_table(tableSpec) as tmp_tags: 

671 # store all incoming data in a temporary table 

672 self._db.insert(tmp_tags, *tmpRows) 

673 

674 # There are some checks that we want to make for consistency 

675 # of the new datasets with existing ones. 

676 self._validateImport(tmp_tags, run) 

677 

678 # Before we merge the temporary table into dataset/tags we need to 

679 # drop datasets which are already there (and do not conflict). 

680 self._db.deleteWhere( 

681 tmp_tags, 

682 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)), 

683 ) 

684 

685 # Copy it into the dataset table; we need to re-label some columns. 

686 self._db.insert( 

687 self._static.dataset, 

688 select=sqlalchemy.sql.select( 

689 tmp_tags.columns.dataset_id.label("id"), 

690 tmp_tags.columns.dataset_type_id, 

691 tmp_tags.columns[collFkName].label(self._runKeyColumn), 

692 timestamp.label("ingest_date"), 

693 ), 

694 ) 

695 

696 # Update the summary tables for this collection in case this 

697 # is the first time this dataset type or these governor values 

698 # will be inserted there. 

699 self._summaries.update(run, [self._dataset_type_id], summary) 

700 

701 # Copy it into the tags table. 

702 self._db.insert(self._tags, select=tmp_tags.select()) 

703 

704 # Return refs in the same order as in the input list. 

705 for dataset_id, dataId in dataIds.items(): 

706 yield DatasetRef( 

707 datasetType=self.datasetType, 

708 id=dataset_id, 

709 dataId=dataId, 

710 run=run.name, 

711 ) 

712 

713 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None: 

714 """Validate imported refs against existing datasets. 

715 

716 Parameters 

717 ---------- 

718 tmp_tags : `sqlalchemy.schema.Table` 

719 Temporary table with new datasets and the same schema as tags 

720 table. 

721 run : `RunRecord` 

722 The record object describing the `~CollectionType.RUN` collection. 

723 

724 Raises 

725 ------ 

726 ConflictingDefinitionError 

727 Raised if new datasets conflict with existing ones. 

728 """ 

729 dataset = self._static.dataset 

730 tags = self._tags 

731 collFkName = self._collections.getCollectionForeignKeyName() 

732 

733 # Check that existing datasets have the same dataset type and 

734 # run. 

735 query = ( 

736 sqlalchemy.sql.select( 

737 dataset.columns.id.label("dataset_id"), 

738 dataset.columns.dataset_type_id.label("dataset_type_id"), 

739 tmp_tags.columns.dataset_type_id.label("new dataset_type_id"), 

740 dataset.columns[self._runKeyColumn].label("run"), 

741 tmp_tags.columns[collFkName].label("new run"), 

742 ) 

743 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id)) 

744 .where( 

745 sqlalchemy.sql.or_( 

746 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

747 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName], 

748 ) 

749 ) 

750 .limit(1) 

751 ) 

752 with self._db.query(query) as result: 

753 if (row := result.first()) is not None: 

754 # Only include the first one in the exception message 

755 raise ConflictingDefinitionError( 

756 f"Existing dataset type or run do not match new dataset: {row._asdict()}" 

757 ) 

758 

759 # Check that matching dataset in tags table has the same DataId. 

760 query = ( 

761 sqlalchemy.sql.select( 

762 tags.columns.dataset_id, 

763 tags.columns.dataset_type_id.label("type_id"), 

764 tmp_tags.columns.dataset_type_id.label("new type_id"), 

765 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

766 *[ 

767 tmp_tags.columns[dim].label(f"new {dim}") 

768 for dim in self.datasetType.dimensions.required.names 

769 ], 

770 ) 

771 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id)) 

772 .where( 

773 sqlalchemy.sql.or_( 

774 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

775 *[ 

776 tags.columns[dim] != tmp_tags.columns[dim] 

777 for dim in self.datasetType.dimensions.required.names 

778 ], 

779 ) 

780 ) 

781 .limit(1) 

782 ) 

783 

784 with self._db.query(query) as result: 

785 if (row := result.first()) is not None: 

786 # Only include the first one in the exception message 

787 raise ConflictingDefinitionError( 

788 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}" 

789 ) 

790 

791 # Check that matching run+dataId have the same dataset ID. 

792 query = ( 

793 sqlalchemy.sql.select( 

794 tags.columns.dataset_type_id.label("dataset_type_id"), 

795 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

796 tags.columns.dataset_id, 

797 tmp_tags.columns.dataset_id.label("new dataset_id"), 

798 tags.columns[collFkName], 

799 tmp_tags.columns[collFkName].label(f"new {collFkName}"), 

800 ) 

801 .select_from( 

802 tags.join( 

803 tmp_tags, 

804 sqlalchemy.sql.and_( 

805 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id, 

806 tags.columns[collFkName] == tmp_tags.columns[collFkName], 

807 *[ 

808 tags.columns[dim] == tmp_tags.columns[dim] 

809 for dim in self.datasetType.dimensions.required.names 

810 ], 

811 ), 

812 ) 

813 ) 

814 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id) 

815 .limit(1) 

816 ) 

817 with self._db.query(query) as result: 

818 if (row := result.first()) is not None: 

819 # Only include the first one in the exception message 

820 raise ConflictingDefinitionError( 

821 f"Existing dataset type and dataId does not match new dataset: {row._asdict()}" 

822 )