Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 87%

328 statements  

coverage.py v7.5.0, created at 2024-04-30 02:52 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28 

29from __future__ import annotations 

30 

31from .... import ddl 

32 

33__all__ = ("ByDimensionsDatasetRecordStorage",) 

34 

35import datetime 

36from collections.abc import Callable, Iterable, Iterator, Sequence, Set 

37from typing import TYPE_CHECKING 

38 

39import astropy.time 

40import sqlalchemy 

41from lsst.daf.relation import Relation, sql 

42 

43from ...._column_tags import DatasetColumnTag, DimensionKeyColumnTag 

44from ...._column_type_info import LogicalColumn 

45from ...._dataset_ref import DatasetId, DatasetIdFactory, DatasetIdGenEnum, DatasetRef 

46from ...._dataset_type import DatasetType 

47from ...._exceptions import CollectionTypeError 

48from ...._timespan import Timespan 

49from ....dimensions import DataCoordinate 

50from ....direct_query_driver import QueryBuilder, QueryJoiner # new query system, server+direct only 

51from ....queries import tree as qt # new query system, both clients + server 

52from ..._collection_summary import CollectionSummary 

53from ..._collection_type import CollectionType 

54from ..._exceptions import ConflictingDefinitionError 

55from ...interfaces import DatasetRecordStorage 

56from ...queries import SqlQueryContext # old registry query system 

57from .tables import makeTagTableSpec 

58 

59if TYPE_CHECKING: 

60 from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord 

61 from .summaries import CollectionSummaryManager 

62 from .tables import StaticDatasetTablesTuple 

63 

64 

65class ByDimensionsDatasetRecordStorage(DatasetRecordStorage): 

66 """Dataset record storage implementation paired with 

67 `ByDimensionsDatasetRecordStorageManagerUUID`; see that class for more 

68 information. 

69 

70 Instances of this class should never be constructed directly; use 

71 `DatasetRecordStorageManager.register` instead. 

72 

73 Parameters 

74 ---------- 

75 datasetType : `DatasetType` 

76 The dataset type to use. 

77 db : `Database` 

78 Database connection. 

79 dataset_type_id : `int` 

80 Dataset type identifier. 

81 collections : `CollectionManager` 

82 The collection manager. 

83 static : `StaticDatasetTablesTuple` 

84 Named tuple of the static dataset tables. 

85 summaries : `CollectionSummaryManager` 

86 Collection summary manager. 

87 tags_table_factory : `~collections.abc.Callable` 

88 Factory for creating tags tables. 

89 use_astropy_ingest_date : `bool` 

90 Whether to use Astropy for ingest date. 

91 calibs_table_factory : `~collections.abc.Callable` 

92 Factory for creating calibration tables. 

93 """ 

94 

95 def __init__( 

96 self, 

97 *, 

98 datasetType: DatasetType, 

99 db: Database, 

100 dataset_type_id: int, 

101 collections: CollectionManager, 

102 static: StaticDatasetTablesTuple, 

103 summaries: CollectionSummaryManager, 

104 tags_table_factory: Callable[[], sqlalchemy.schema.Table], 

105 use_astropy_ingest_date: bool, 

106 calibs_table_factory: Callable[[], sqlalchemy.schema.Table] | None, 

107 ): 

108 super().__init__(datasetType=datasetType) 

109 self._dataset_type_id = dataset_type_id 

110 self._db = db 

111 self._collections = collections 

112 self._static = static 

113 self._summaries = summaries 

114 self._tags_table_factory = tags_table_factory 

115 self._calibs_table_factory = calibs_table_factory 

116 self._runKeyColumn = collections.getRunForeignKeyName() 

117 self._use_astropy = use_astropy_ingest_date 

118 self._tags_table: sqlalchemy.schema.Table | None = None 

119 self._calibs_table: sqlalchemy.schema.Table | None = None 

120 

121 @property 

122 def _tags(self) -> sqlalchemy.schema.Table: 

123 if self._tags_table is None: 

124 self._tags_table = self._tags_table_factory() 

125 return self._tags_table 

126 

127 @property 

128 def _calibs(self) -> sqlalchemy.schema.Table | None: 

129 if self._calibs_table is None: 

130 if self._calibs_table_factory is None: 130 ↛ 131: line 130 didn't jump to line 131, because the condition on line 130 was never true

131 return None 

132 self._calibs_table = self._calibs_table_factory() 

133 return self._calibs_table 

134 

135 def delete(self, datasets: Iterable[DatasetRef]) -> None: 

136 # Docstring inherited from DatasetRecordStorage. 

137 # Only delete from common dataset table; ON DELETE foreign key clauses 

138 # will handle the rest. 

139 self._db.delete( 

140 self._static.dataset, 

141 ["id"], 

142 *[{"id": dataset.id} for dataset in datasets], 

143 ) 

144 

145 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

146 # Docstring inherited from DatasetRecordStorage. 

147 if collection.type is not CollectionType.TAGGED: 147 ↛ 148: line 147 didn't jump to line 148, because the condition on line 147 was never true

148 raise TypeError( 

149 f"Cannot associate into collection '{collection.name}' " 

150 f"of type {collection.type.name}; must be TAGGED." 

151 ) 

152 protoRow = { 

153 self._collections.getCollectionForeignKeyName(): collection.key, 

154 "dataset_type_id": self._dataset_type_id, 

155 } 

156 rows = [] 

157 summary = CollectionSummary() 

158 for dataset in summary.add_datasets_generator(datasets): 

159 rows.append(dict(protoRow, dataset_id=dataset.id, **dataset.dataId.required)) 

160 # Update the summary tables for this collection in case this is the 

161 # first time this dataset type or these governor values will be 

162 # inserted there. 

163 self._summaries.update(collection, [self._dataset_type_id], summary) 

164 # Update the tag table itself. 

165 self._db.replace(self._tags, *rows) 

166 

167 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

168 # Docstring inherited from DatasetRecordStorage. 

169 if collection.type is not CollectionType.TAGGED: 169 ↛ 170: line 169 didn't jump to line 170, because the condition on line 169 was never true

170 raise TypeError( 

171 f"Cannot disassociate from collection '{collection.name}' " 

172 f"of type {collection.type.name}; must be TAGGED." 

173 ) 

174 rows = [ 

175 { 

176 "dataset_id": dataset.id, 

177 self._collections.getCollectionForeignKeyName(): collection.key, 

178 } 

179 for dataset in datasets 

180 ] 

181 self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()], *rows) 

182 

183 def _buildCalibOverlapQuery( 

184 self, 

185 collection: CollectionRecord, 

186 data_ids: set[DataCoordinate] | None, 

187 timespan: Timespan, 

188 context: SqlQueryContext, 

189 ) -> Relation: 

190 relation = self.make_relation( 

191 collection, columns={"timespan", "dataset_id", "calib_pkey"}, context=context 

192 ).with_rows_satisfying( 

193 context.make_timespan_overlap_predicate( 

194 DatasetColumnTag(self.datasetType.name, "timespan"), timespan 

195 ), 

196 ) 

197 if data_ids is not None: 

198 relation = relation.join( 

199 context.make_data_id_relation( 

200 data_ids, self.datasetType.dimensions.required.names 

201 ).transferred_to(context.sql_engine), 

202 ) 

203 return relation 

204 

205 def certify( 

206 self, 

207 collection: CollectionRecord, 

208 datasets: Iterable[DatasetRef], 

209 timespan: Timespan, 

210 context: SqlQueryContext, 

211 ) -> None: 

212 # Docstring inherited from DatasetRecordStorage. 

213 if self._calibs is None: 213 ↛ 214: line 213 didn't jump to line 214, because the condition on line 213 was never true

214 raise CollectionTypeError( 

215 f"Cannot certify datasets of type {self.datasetType.name}, for which " 

216 "DatasetType.isCalibration() is False." 

217 ) 

218 if collection.type is not CollectionType.CALIBRATION: 218 ↛ 219: line 218 didn't jump to line 219, because the condition on line 218 was never true

219 raise CollectionTypeError( 

220 f"Cannot certify into collection '{collection.name}' " 

221 f"of type {collection.type.name}; must be CALIBRATION." 

222 ) 

223 TimespanReprClass = self._db.getTimespanRepresentation() 

224 protoRow = { 

225 self._collections.getCollectionForeignKeyName(): collection.key, 

226 "dataset_type_id": self._dataset_type_id, 

227 } 

228 rows = [] 

229 dataIds: set[DataCoordinate] | None = ( 

230 set() if not TimespanReprClass.hasExclusionConstraint() else None 

231 ) 

232 summary = CollectionSummary() 

233 for dataset in summary.add_datasets_generator(datasets): 

234 row = dict(protoRow, dataset_id=dataset.id, **dataset.dataId.required) 

235 TimespanReprClass.update(timespan, result=row) 

236 rows.append(row) 

237 if dataIds is not None: 237 ↛ 233: line 237 didn't jump to line 233, because the condition on line 237 was never false

238 dataIds.add(dataset.dataId) 

239 # Update the summary tables for this collection in case this is the 

240 # first time this dataset type or these governor values will be 

241 # inserted there. 

242 self._summaries.update(collection, [self._dataset_type_id], summary) 

243 # Update the association table itself. 

244 if TimespanReprClass.hasExclusionConstraint(): 244 ↛ 247: line 244 didn't jump to line 247, because the condition on line 244 was never true

245 # Rely on database constraint to enforce invariants; we just 

246 # reraise the exception for consistency across DB engines. 

247 try: 

248 self._db.insert(self._calibs, *rows) 

249 except sqlalchemy.exc.IntegrityError as err: 

250 raise ConflictingDefinitionError( 

251 f"Validity range conflict certifying datasets of type {self.datasetType.name} " 

252 f"into {collection.name} for range [{timespan.begin}, {timespan.end})." 

253 ) from err 

254 else: 

255 # Have to implement exclusion constraint ourselves. 

256 # Start by building a SELECT query for any rows that would overlap 

257 # this one. 

258 relation = self._buildCalibOverlapQuery(collection, dataIds, timespan, context) 

259 # Acquire a table lock to ensure there are no concurrent writes 

260 # that could invalidate our checking before we finish the inserts. We 

261 # use a SAVEPOINT in case there is an outer transaction that a 

262 # failure here should not roll back. 

263 with self._db.transaction(lock=[self._calibs], savepoint=True): 

264 # Enter SqlQueryContext in case we need to use a temporary 

265 # table to include the given data IDs in the query. Note that 

266 # by doing this inside the transaction, we make sure it doesn't 

267 # attempt to close the session when it's done, since it just 

268 # sees an already-open session that it knows it shouldn't 

269 # manage. 

270 with context: 

271 # Run the check SELECT query. 

272 conflicting = context.count(context.process(relation)) 

273 if conflicting > 0: 

274 raise ConflictingDefinitionError( 

275 f"{conflicting} validity range conflicts certifying datasets of type " 

276 f"{self.datasetType.name} into {collection.name} for range " 

277 f"[{timespan.begin}, {timespan.end})." 

278 ) 

279 # Proceed with the insert. 

280 self._db.insert(self._calibs, *rows) 

281 
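The else branch above implements the exclusion constraint manually: lock the calibs table, run a SELECT that counts overlapping validity ranges, and only insert if nothing conflicts. A minimal sketch of that check-then-insert pattern with plain SQLAlchemy Core follows; the toy table, the half-open integer begin/end columns, and the certify_one helper are assumptions for illustration only, not the real daf_butler schema or Database API (which also keys the check on collection and data ID and takes the table lock inside a SAVEPOINT).

import sqlalchemy

metadata = sqlalchemy.MetaData()
toy_calibs = sqlalchemy.Table(
    "toy_calibs",
    metadata,
    sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True),
    sqlalchemy.Column("dataset_id", sqlalchemy.Integer, nullable=False),
    sqlalchemy.Column("begin", sqlalchemy.Integer, nullable=False),
    sqlalchemy.Column("end", sqlalchemy.Integer, nullable=False),
)
engine = sqlalchemy.create_engine("sqlite:///:memory:")
metadata.create_all(engine)


def certify_one(dataset_id: int, begin: int, end: int) -> None:
    """Insert a half-open [begin, end) validity range, refusing any overlap."""
    with engine.begin() as conn:
        # SELECT the number of existing rows whose range overlaps the new one.
        conflicting = conn.execute(
            sqlalchemy.select(sqlalchemy.func.count())
            .select_from(toy_calibs)
            .where(toy_calibs.c.dataset_id == dataset_id)
            .where(toy_calibs.c.begin < end, toy_calibs.c.end > begin)
        ).scalar_one()
        if conflicting > 0:
            raise RuntimeError(f"{conflicting} validity range conflict(s) for [{begin}, {end}).")
        # No conflicts: proceed with the insert inside the same transaction.
        conn.execute(toy_calibs.insert().values(dataset_id=dataset_id, begin=begin, end=end))


certify_one(1, 10, 20)
certify_one(1, 20, 30)   # fine: [10, 20) and [20, 30) are half-open and do not overlap
# certify_one(1, 15, 25) would raise, because it overlaps both existing ranges.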

282 def decertify( 

283 self, 

284 collection: CollectionRecord, 

285 timespan: Timespan, 

286 *, 

287 dataIds: Iterable[DataCoordinate] | None = None, 

288 context: SqlQueryContext, 

289 ) -> None: 

290 # Docstring inherited from DatasetRecordStorage. 

291 if self._calibs is None: 291 ↛ 292: line 291 didn't jump to line 292, because the condition on line 291 was never true

292 raise CollectionTypeError( 

293 f"Cannot decertify datasets of type {self.datasetType.name}, for which " 

294 "DatasetType.isCalibration() is False." 

295 ) 

296 if collection.type is not CollectionType.CALIBRATION: 296 ↛ 297: line 296 didn't jump to line 297, because the condition on line 296 was never true

297 raise CollectionTypeError( 

298 f"Cannot decertify from collection '{collection.name}' " 

299 f"of type {collection.type.name}; must be CALIBRATION." 

300 ) 

301 TimespanReprClass = self._db.getTimespanRepresentation() 

302 # Construct a SELECT query to find all rows that overlap our inputs. 

303 dataIdSet: set[DataCoordinate] | None 

304 if dataIds is not None: 

305 dataIdSet = set(dataIds) 

306 else: 

307 dataIdSet = None 

308 relation = self._buildCalibOverlapQuery(collection, dataIdSet, timespan, context) 

309 calib_pkey_tag = DatasetColumnTag(self.datasetType.name, "calib_pkey") 

310 dataset_id_tag = DatasetColumnTag(self.datasetType.name, "dataset_id") 

311 timespan_tag = DatasetColumnTag(self.datasetType.name, "timespan") 

312 data_id_tags = [ 

313 (name, DimensionKeyColumnTag(name)) for name in self.datasetType.dimensions.required.names 

314 ] 

315 # Set up collections to populate with the rows we'll want to modify. 

316 # The insert rows will have the same values for collection and 

317 # dataset type. 

318 protoInsertRow = { 

319 self._collections.getCollectionForeignKeyName(): collection.key, 

320 "dataset_type_id": self._dataset_type_id, 

321 } 

322 rowsToDelete = [] 

323 rowsToInsert = [] 

324 # Acquire a table lock to ensure there are no concurrent writes 

325 # between the SELECT and the DELETE and INSERT queries based on it. 

326 with self._db.transaction(lock=[self._calibs], savepoint=True): 

327 # Enter SqlQueryContext in case we need to use a temporary table to 

328 # include the given data IDs in the query (see similar block in 

329 # certify for details). 

330 with context: 

331 for row in context.fetch_iterable(relation): 

332 rowsToDelete.append({"id": row[calib_pkey_tag]}) 

333 # Construct the insert row(s) by copying the prototype row, 

334 # then adding the dimension column values, then adding 

335 # what's left of the timespan from that row after we 

336 # subtract the given timespan. 

337 newInsertRow = protoInsertRow.copy() 

338 newInsertRow["dataset_id"] = row[dataset_id_tag] 

339 for name, tag in data_id_tags: 

340 newInsertRow[name] = row[tag] 

341 rowTimespan = row[timespan_tag] 

342 assert rowTimespan is not None, "Field should have a NOT NULL constraint." 

343 for diffTimespan in rowTimespan.difference(timespan): 

344 rowsToInsert.append( 

345 TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()) 

346 ) 

347 # Run the DELETE and INSERT queries. 

348 self._db.delete(self._calibs, ["id"], *rowsToDelete) 

349 self._db.insert(self._calibs, *rowsToInsert) 

350 
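Decertify deletes every overlapping row and re-inserts whatever is left of each row's validity range after subtracting the decertified timespan. A minimal sketch of that subtraction, using half-open integer intervals as a stand-in for lsst Timespan.difference (an assumption for illustration only):

def difference(existing: tuple[int, int], removed: tuple[int, int]) -> list[tuple[int, int]]:
    """Return the parts of ``existing`` not covered by ``removed``."""
    begin, end = existing
    r_begin, r_end = removed
    pieces = []
    if r_begin > begin:
        pieces.append((begin, min(end, r_begin)))
    if r_end < end:
        pieces.append((max(begin, r_end), end))
    # Drop empty pieces that can appear when the ranges barely touch.
    return [(b, e) for b, e in pieces if b < e]


assert difference((0, 100), (40, 60)) == [(0, 40), (60, 100)]   # split into two rows to re-insert
assert difference((0, 100), (0, 60)) == [(60, 100)]             # trimmed on the left
assert difference((0, 100), (0, 100)) == []                     # fully decertified, nothing to re-insert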

351 def make_relation( 

352 self, 

353 *collections: CollectionRecord, 

354 columns: Set[str], 

355 context: SqlQueryContext, 

356 ) -> Relation: 

357 # Docstring inherited from DatasetRecordStorage. 

358 collection_types = {collection.type for collection in collections} 

359 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened." 

360 TimespanReprClass = self._db.getTimespanRepresentation() 

361 # 

362 # There are two kinds of table in play here: 

363 # 

364 # - the static dataset table (with the dataset ID, dataset type ID, 

365 # run ID/name, and ingest date); 

366 # 

367 # - the dynamic tags/calibs table (with the dataset ID, dataset type 

368 # ID, collection ID/name, data ID, and possibly validity 

369 # range). 

370 # 

371 # That means that we might want to return a query against either table 

372 # or a JOIN of both, depending on which quantities the caller wants. 

373 # But the data ID is always included, which means we'll always include 

374 # the tags/calibs table and join in the static dataset table only if we 

375 # need things from it that we can't get from the tags/calibs table. 

376 # 

377 # Note that it's important that we include a WHERE constraint on both 

378 # tables for any column (e.g. dataset_type_id) that is in both when 

379 # it's given explicitly; not doing so can prevent the query planner from 

380 # using very important indexes. At present, we don't include those 

381 # redundant columns in the JOIN ON expression, however, because the 

382 # FOREIGN KEY (and its index) are defined only on dataset_id. 

383 tag_relation: Relation | None = None 

384 calib_relation: Relation | None = None 

385 if collection_types != {CollectionType.CALIBRATION}: 

386 # We'll need a subquery for the tags table if any of the given 

387 # collections are not a CALIBRATION collection. This intentionally 

388 # also fires when the list of collections is empty as a way to 

389 # create a dummy subquery that we know will fail. 

390 # We give the table an alias because it might appear multiple times 

391 # in the same query, for different dataset types. 

392 tags_parts = sql.Payload[LogicalColumn](self._tags.alias(f"{self.datasetType.name}_tags")) 

393 if "timespan" in columns: 

394 tags_parts.columns_available[DatasetColumnTag(self.datasetType.name, "timespan")] = ( 

395 TimespanReprClass.fromLiteral(Timespan(None, None)) 

396 ) 

397 tag_relation = self._finish_single_relation( 

398 tags_parts, 

399 columns, 

400 [ 

401 (record, rank) 

402 for rank, record in enumerate(collections) 

403 if record.type is not CollectionType.CALIBRATION 

404 ], 

405 context, 

406 ) 

407 assert "calib_pkey" not in columns, "For internal use only, and only for pure-calib queries." 

408 if CollectionType.CALIBRATION in collection_types: 

409 # If at least one collection is a CALIBRATION collection, we'll 

410 # need a subquery for the calibs table, and could include the 

411 # timespan as a result or constraint. 

412 assert ( 

413 self._calibs is not None 

414 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection." 

415 calibs_parts = sql.Payload[LogicalColumn](self._calibs.alias(f"{self.datasetType.name}_calibs")) 

416 if "timespan" in columns: 

417 calibs_parts.columns_available[DatasetColumnTag(self.datasetType.name, "timespan")] = ( 

418 TimespanReprClass.from_columns(calibs_parts.from_clause.columns) 

419 ) 

420 if "calib_pkey" in columns: 

421 # This is a private extension not included in the base class 

422 # interface, for internal use only in _buildCalibOverlapQuery, 

423 # which needs access to the autoincrement primary key for the 

424 # calib association table. 

425 calibs_parts.columns_available[DatasetColumnTag(self.datasetType.name, "calib_pkey")] = ( 

426 calibs_parts.from_clause.columns.id 

427 ) 

428 calib_relation = self._finish_single_relation( 

429 calibs_parts, 

430 columns, 

431 [ 

432 (record, rank) 

433 for rank, record in enumerate(collections) 

434 if record.type is CollectionType.CALIBRATION 

435 ], 

436 context, 

437 ) 

438 if tag_relation is not None: 

439 if calib_relation is not None: 

440 # daf_relation's chain operation does not automatically 

441 # deduplicate; it's more like SQL's UNION ALL. To get UNION 

442 # in SQL here, we add an explicit deduplication. 

443 return tag_relation.chain(calib_relation).without_duplicates() 

444 else: 

445 return tag_relation 

446 elif calib_relation is not None: 

447 return calib_relation 

448 else: 

449 raise AssertionError("Branch should be unreachable.") 

450 
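As the comment near the end of make_relation notes, Relation.chain behaves like SQL UNION ALL, so an explicit deduplication step is added to get UNION semantics when both the tags and calibs subqueries are present. A minimal sketch of that distinction in plain SQLAlchemy Core, with toy single-column tables assumed for illustration:

import sqlalchemy

metadata = sqlalchemy.MetaData()
toy_tags = sqlalchemy.Table("toy_tags", metadata, sqlalchemy.Column("dataset_id", sqlalchemy.Integer))
toy_calibs = sqlalchemy.Table("toy_calibs", metadata, sqlalchemy.Column("dataset_id", sqlalchemy.Integer))

tag_query = sqlalchemy.select(toy_tags.c.dataset_id)
calib_query = sqlalchemy.select(toy_calibs.c.dataset_id)

# chain() alone corresponds to UNION ALL, which keeps rows found in both subqueries.
with_duplicates = sqlalchemy.union_all(tag_query, calib_query)
# chain() followed by without_duplicates() corresponds to UNION, which removes them.
deduplicated = sqlalchemy.union(tag_query, calib_query)
print(deduplicated)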

451 def _finish_single_relation( 

452 self, 

453 payload: sql.Payload[LogicalColumn], 

454 requested_columns: Set[str], 

455 collections: Sequence[tuple[CollectionRecord, int]], 

456 context: SqlQueryContext, 

457 ) -> Relation: 

458 """Handle adding columns and WHERE terms that are not specific to 

459 either the tags or calibs tables. 

460 

461 Helper method for `make_relation`. 

462 

463 Parameters 

464 ---------- 

465 payload : `lsst.daf.relation.sql.Payload` 

466 SQL query parts under construction, to be modified in-place and 

467 used to construct the new relation. 

468 requested_columns : `~collections.abc.Set` [ `str` ] 

469 Columns the relation should include. 

470 collections : `~collections.abc.Sequence` [ `tuple` \ 

471 [ `CollectionRecord`, `int` ] ] 

472 Collections to search for the dataset and their ranks. 

473 context : `SqlQueryContext` 

474 Context that manages engines and state for the query. 

475 

476 Returns 

477 ------- 

478 relation : `lsst.daf.relation.Relation` 

479 New dataset query relation. 

480 """ 

481 payload.where.append(payload.from_clause.columns.dataset_type_id == self._dataset_type_id) 

482 dataset_id_col = payload.from_clause.columns.dataset_id 

483 collection_col = payload.from_clause.columns[self._collections.getCollectionForeignKeyName()] 

484 # We always constrain and optionally retrieve the collection(s) via the 

485 # tags/calibs table. 

486 if len(collections) == 1: 

487 payload.where.append(collection_col == collections[0][0].key) 

488 if "collection" in requested_columns: 

489 payload.columns_available[DatasetColumnTag(self.datasetType.name, "collection")] = ( 

490 sqlalchemy.sql.literal(collections[0][0].key) 

491 ) 

492 else: 

493 assert collections, "The no-collections case should be in calling code for better diagnostics." 

494 payload.where.append(collection_col.in_([collection.key for collection, _ in collections])) 

495 if "collection" in requested_columns: 

496 payload.columns_available[DatasetColumnTag(self.datasetType.name, "collection")] = ( 

497 collection_col 

498 ) 

499 # Add rank if requested, as a CASE-based calculation on the collection 

500 # column. 

501 if "rank" in requested_columns: 

502 payload.columns_available[DatasetColumnTag(self.datasetType.name, "rank")] = sqlalchemy.sql.case( 

503 {record.key: rank for record, rank in collections}, 

504 value=collection_col, 

505 ) 

506 # Add more column definitions, starting with the data ID. 

507 for dimension_name in self.datasetType.dimensions.required.names: 

508 payload.columns_available[DimensionKeyColumnTag(dimension_name)] = payload.from_clause.columns[ 

509 dimension_name 

510 ] 

511 # We can always get the dataset_id from the tags/calibs table. 

512 if "dataset_id" in requested_columns: 

513 payload.columns_available[DatasetColumnTag(self.datasetType.name, "dataset_id")] = dataset_id_col 

514 # It's possible we now have everything we need, from just the 

515 # tags/calibs table. The things we might need to get from the static 

516 # dataset table are the run key and the ingest date. 

517 need_static_table = False 

518 if "run" in requested_columns: 

519 if len(collections) == 1 and collections[0][0].type is CollectionType.RUN: 

520 # If we are searching exactly one RUN collection, we 

521 # know that if we find the dataset in that collection, 

522 # then that's the dataset's run; we don't need to 

523 # query for it. 

524 payload.columns_available[DatasetColumnTag(self.datasetType.name, "run")] = ( 

525 sqlalchemy.sql.literal(collections[0][0].key) 

526 ) 

527 else: 

528 payload.columns_available[DatasetColumnTag(self.datasetType.name, "run")] = ( 

529 self._static.dataset.columns[self._runKeyColumn] 

530 ) 

531 need_static_table = True 

532 # Ingest date can only come from the static table. 

533 if "ingest_date" in requested_columns: 

534 need_static_table = True 

535 payload.columns_available[DatasetColumnTag(self.datasetType.name, "ingest_date")] = ( 

536 self._static.dataset.columns.ingest_date 

537 ) 

538 # If we need the static table, join it in via dataset_id and 

539 # dataset_type_id 

540 if need_static_table: 

541 payload.from_clause = payload.from_clause.join( 

542 self._static.dataset, onclause=(dataset_id_col == self._static.dataset.columns.id) 

543 ) 

544 # Also constrain dataset_type_id in static table in case that helps 

545 # generate a better plan. 

546 # We could also include this in the JOIN ON clause, but my guess is 

547 # that that's a good idea IFF it's in the foreign key, and right 

548 # now it isn't. 

549 payload.where.append(self._static.dataset.columns.dataset_type_id == self._dataset_type_id) 

550 leaf = context.sql_engine.make_leaf( 

551 payload.columns_available.keys(), 

552 payload=payload, 

553 name=self.datasetType.name, 

554 parameters={record.name: rank for record, rank in collections}, 

555 ) 

556 return leaf 

557 
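The rank column above is computed with a CASE expression keyed on the collection foreign-key column, so no join to the collection table is needed; _finish_query_builder below uses the same trick to recover collection names. A minimal sketch of that CASE construct, assuming a toy tags table and made-up integer collection keys:

import sqlalchemy

metadata = sqlalchemy.MetaData()
toy_tags = sqlalchemy.Table(
    "toy_tags",
    metadata,
    sqlalchemy.Column("dataset_id", sqlalchemy.Integer),
    sqlalchemy.Column("collection_id", sqlalchemy.Integer),
)
# Collections in search order: key -> rank (0 is searched first).
search_order = {11: 0, 7: 1, 42: 2}
rank = sqlalchemy.case(search_order, value=toy_tags.c.collection_id).label("rank")
query = sqlalchemy.select(toy_tags.c.dataset_id, rank)
print(query)  # SELECT ... CASE toy_tags.collection_id WHEN ... THEN ... END AS rank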

558 def make_query_joiner(self, collections: Sequence[CollectionRecord], fields: Set[str]) -> QueryJoiner: 

559 # This method largely mimics `make_relation`, but it uses the new query 

560 # system primitives instead of the old one. In terms of the SQL 

561 # queries it builds, there are two further main differences: 

562 # 

563 # - Collection and run columns are now string names rather than IDs. 

564 # This insulates the query result-processing code from collection 

565 # caching and the collection manager subclass details. 

566 # 

567 # - The subquery always has unique rows, which is achieved by using 

568 # SELECT DISTINCT when necessary. 

569 # 

570 collection_types = {collection.type for collection in collections} 

571 assert CollectionType.CHAINED not in collection_types, "CHAINED collections must be flattened." 

572 # 

573 # There are two kinds of table in play here: 

574 # 

575 # - the static dataset table (with the dataset ID, dataset type ID, 

576 # run ID/name, and ingest date); 

577 # 

578 # - the dynamic tags/calibs table (with the dataset ID, dataset type 

579 # ID, collection ID/name, data ID, and possibly validity 

580 # range). 

581 # 

582 # That means that we might want to return a query against either table 

583 # or a JOIN of both, depending on which quantities the caller wants. 

584 # But the data ID is always included, which means we'll always include 

585 # the tags/calibs table and join in the static dataset table only if we 

586 # need things from it that we can't get from the tags/calibs table. 

587 # 

588 # Note that it's important that we include a WHERE constraint on both 

589 # tables for any column (e.g. dataset_type_id) that is in both when 

590 # it's given explicitly; not doing so can prevent the query planner from 

591 # using very important indexes. At present, we don't include those 

592 # redundant columns in the JOIN ON expression, however, because the 

593 # FOREIGN KEY (and its index) are defined only on dataset_id. 

594 columns = qt.ColumnSet(self.datasetType.dimensions.as_group()) 

595 columns.drop_implied_dimension_keys() 

596 columns.dataset_fields[self.datasetType.name].update(fields) 

597 tags_builder: QueryBuilder | None = None 

598 if collection_types != {CollectionType.CALIBRATION}: 598 ↛ 614: line 598 didn't jump to line 614, because the condition on line 598 was never false

599 # We'll need a subquery for the tags table if any of the given 

600 # collections are not a CALIBRATION collection. This intentionally 

601 # also fires when the list of collections is empty as a way to 

602 # create a dummy subquery that we know will fail. 

603 # We give the table an alias because it might appear multiple times 

604 # in the same query, for different dataset types. 

605 tags_builder = self._finish_query_builder( 

606 QueryJoiner(self._db, self._tags.alias(f"{self.datasetType.name}_tags")).to_builder(columns), 

607 [record for record in collections if record.type is not CollectionType.CALIBRATION], 

608 fields, 

609 ) 

610 if "timespan" in fields: 610 ↛ 611line 610 didn't jump to line 611

611 tags_builder.joiner.timespans[self.datasetType.name] = ( 

612 self._db.getTimespanRepresentation().fromLiteral(Timespan(None, None)) 

613 ) 

614 calibs_builder: QueryBuilder | None = None 

615 if CollectionType.CALIBRATION in collection_types: 615 ↛ 619: line 615 didn't jump to line 619, because the condition on line 615 was never true

616 # If at least one collection is a CALIBRATION collection, we'll 

617 # need a subquery for the calibs table, and could include the 

618 # timespan as a result or constraint. 

619 assert ( 

620 self._calibs is not None 

621 ), "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection." 

622 calibs_builder = self._finish_query_builder( 

623 QueryJoiner(self._db, self._calibs.alias(f"{self.datasetType.name}_calibs")).to_builder( 

624 columns 

625 ), 

626 [record for record in collections if record.type is CollectionType.CALIBRATION], 

627 fields, 

628 ) 

629 if "timespan" in fields: 

630 calibs_builder.joiner.timespans[self.datasetType.name] = ( 

631 self._db.getTimespanRepresentation().from_columns(self._calibs.columns) 

632 ) 

633 

634 # In calibration collections, we need timespan as well as data ID 

635 # to ensure unique rows. 

636 calibs_builder.distinct = calibs_builder.distinct and "timespan" not in fields 

637 if tags_builder is not None: 637 ↛ 643: line 637 didn't jump to line 643, because the condition on line 637 was never false

638 if calibs_builder is not None: 638 ↛ 640: line 638 didn't jump to line 640, because the condition on line 638 was never true

639 # Need a UNION subquery. 

640 return tags_builder.union_subquery([calibs_builder]) 

641 else: 

642 return tags_builder.to_joiner() 

643 elif calibs_builder is not None: 

644 return calibs_builder.to_joiner() 

645 else: 

646 raise AssertionError("Branch should be unreachable.") 

647 

648 def _finish_query_builder( 

649 self, 

650 sql_projection: QueryBuilder, 

651 collections: Sequence[CollectionRecord], 

652 fields: Set[str], 

653 ) -> QueryBuilder: 

654 # This method plays the same role as _finish_single_relation in the new 

655 # query system. It is called exactly one or two times by 

656 # make_query_joiner, just as _finish_single_relation is called exactly 

657 # one or two times by make_relation. See make_query_joiner comments for 

658 # what's different. 

659 assert sql_projection.joiner.from_clause is not None 

660 run_collections_only = all(record.type is CollectionType.RUN for record in collections) 

661 sql_projection.joiner.where( 

662 sql_projection.joiner.from_clause.c.dataset_type_id == self._dataset_type_id 

663 ) 

664 dataset_id_col = sql_projection.joiner.from_clause.c.dataset_id 

665 collection_col = sql_projection.joiner.from_clause.c[self._collections.getCollectionForeignKeyName()] 

666 fields_provided = sql_projection.joiner.fields[self.datasetType.name] 

667 # We always constrain and optionally retrieve the collection(s) via the 

668 # tags/calibs table. 

669 if "collection_key" in fields: 669 ↛ 670line 669 didn't jump to line 670, because the condition on line 669 was never true

670 sql_projection.joiner.fields[self.datasetType.name]["collection_key"] = collection_col 

671 if len(collections) == 1: 

672 only_collection_record = collections[0] 

673 sql_projection.joiner.where(collection_col == only_collection_record.key) 

674 if "collection" in fields: 674 ↛ 675line 674 didn't jump to line 675, because the condition on line 674 was never true

675 fields_provided["collection"] = sqlalchemy.literal(only_collection_record.name) 

676 elif not collections: 

677 sql_projection.joiner.where(sqlalchemy.literal(False)) 

678 if "collection" in fields: 678 ↛ 679line 678 didn't jump to line 679, because the condition on line 678 was never true

679 fields_provided["collection"] = sqlalchemy.literal("NO COLLECTIONS") 

680 else: 

681 sql_projection.joiner.where(collection_col.in_([collection.key for collection in collections])) 

682 if "collection" in fields: 

683 # Avoid a join to the collection table to get the name by using 

684 # a CASE statement. The SQL will be a bit more verbose but 

685 # more efficient. 

686 fields_provided["collection"] = sqlalchemy.case( 

687 {record.key: record.name for record in collections}, value=collection_col 

688 ) 

689 # Add more column definitions, starting with the data ID. 

690 sql_projection.joiner.extract_dimensions(self.datasetType.dimensions.required.names) 

691 # We can always get the dataset_id from the tags/calibs table, even if 

692 # we could also get it from the 'static' dataset table. 

693 if "dataset_id" in fields: 693 ↛ 694line 693 didn't jump to line 694, because the condition on line 693 was never true

694 fields_provided["dataset_id"] = dataset_id_col 

695 

696 # It's possible we now have everything we need, from just the 

697 # tags/calibs table. The things we might need to get from the static 

698 # dataset table are the run key and the ingest date. 

699 need_static_table = False 

700 if "run" in fields: 700 ↛ 701line 700 didn't jump to line 701, because the condition on line 700 was never true

701 if len(collections) == 1 and run_collections_only: 

702 # If we are searching exactly one RUN collection, we 

703 # know that if we find the dataset in that collection, 

704 # then that's the dataset's run; we don't need to 

705 # query for it. 

706 fields_provided["run"] = sqlalchemy.literal(only_collection_record.name) 

707 elif run_collections_only: 

708 # Once again we can avoid joining to the collection table by 

709 # adding a CASE statement. 

710 fields_provided["run"] = sqlalchemy.case( 

711 {record.key: record.name for record in collections}, 

712 value=self._static.dataset.c[self._runKeyColumn], 

713 ) 

714 need_static_table = True 

715 else: 

716 # Here we can't avoid a join to the collection table, because 

717 # we might find a dataset via something other than its RUN 

718 # collection. 

719 ( 

720 fields_provided["run"], 

721 sql_projection.joiner.from_clause, 

722 ) = self._collections.lookup_name_sql( 

723 self._static.dataset.c[self._runKeyColumn], 

724 sql_projection.joiner.from_clause, 

725 ) 

726 need_static_table = True 

727 # Ingest date can only come from the static table. 

728 if "ingest_date" in fields: 728 ↛ 729line 728 didn't jump to line 729, because the condition on line 728 was never true

729 fields_provided["ingest_date"] = self._static.dataset.c.ingest_date 

730 need_static_table = True 

731 if need_static_table: 731 ↛ 735: line 731 didn't jump to line 735, because the condition on line 731 was never true

732 # If we need the static table, join it in via dataset_id. We don't 

733 # use QueryJoiner.join because we're joining on dataset ID, not 

734 # dimensions. 

735 sql_projection.joiner.from_clause = sql_projection.joiner.from_clause.join( 

736 self._static.dataset, onclause=(dataset_id_col == self._static.dataset.c.id) 

737 ) 

738 # Also constrain dataset_type_id in static table in case that helps 

739 # generate a better plan. We could also include this in the JOIN ON 

740 # clause, but my guess is that that's a good idea IFF it's in the 

741 # foreign key, and right now it isn't. 

742 sql_projection.joiner.where(self._static.dataset.c.dataset_type_id == self._dataset_type_id) 

743 sql_projection.distinct = ( 

744 # If there are multiple collections, this subquery might have 

745 # non-unique rows. 

746 len(collections) > 1 

747 and not fields 

748 ) 

749 return sql_projection 

750 

751 def getDataId(self, id: DatasetId) -> DataCoordinate: 

752 """Return DataId for a dataset. 

753 

754 Parameters 

755 ---------- 

756 id : `DatasetId` 

757 Unique dataset identifier. 

758 

759 Returns 

760 ------- 

761 dataId : `DataCoordinate` 

762 DataId for the dataset. 

763 """ 

764 # This query could return multiple rows (one for each tagged collection 

765 # the dataset is in, plus one for its run collection), and we don't 

766 # care which of those we get. 

767 sql = ( 

768 self._tags.select() 

769 .where( 

770 sqlalchemy.sql.and_( 

771 self._tags.columns.dataset_id == id, 

772 self._tags.columns.dataset_type_id == self._dataset_type_id, 

773 ) 

774 ) 

775 .limit(1) 

776 ) 

777 with self._db.query(sql) as sql_result: 

778 row = sql_result.mappings().fetchone() 

779 assert row is not None, "Should be guaranteed by caller and foreign key constraints." 

780 return DataCoordinate.from_required_values( 

781 self.datasetType.dimensions.as_group(), 

782 tuple(row[dimension] for dimension in self.datasetType.dimensions.required.names), 

783 ) 

784 

785 

786class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage): 

787 """Implementation of ByDimensionsDatasetRecordStorage which uses UUID for 

788 dataset IDs. 

789 """ 

790 

791 idMaker = DatasetIdFactory() 

792 """Factory for dataset IDs. In the future this factory may be shared with 

793 other classes (e.g. Registry).""" 

794 

795 def insert( 

796 self, 

797 run: RunRecord, 

798 dataIds: Iterable[DataCoordinate], 

799 idMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

800 ) -> Iterator[DatasetRef]: 

801 # Docstring inherited from DatasetRecordStorage. 

802 

803 # Current timestamp, type depends on schema version. Use microsecond 

804 # precision for astropy time to keep things consistent with 

805 # TIMESTAMP(6) SQL type. 

806 timestamp: datetime.datetime | astropy.time.Time 

807 if self._use_astropy: 

808 # Astropy `now()` precision should be the same as `datetime.now()`, which 

809 # should mean microsecond. 

810 timestamp = astropy.time.Time.now() 

811 else: 

812 timestamp = datetime.datetime.now(datetime.UTC) 

813 

814 # Iterate over data IDs, transforming a possibly-single-pass iterable 

815 # into a list. 

816 dataIdList: list[DataCoordinate] = [] 

817 rows = [] 

818 summary = CollectionSummary() 

819 for dataId in summary.add_data_ids_generator(self.datasetType, dataIds): 

820 dataIdList.append(dataId) 

821 rows.append( 

822 { 

823 "id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode), 

824 "dataset_type_id": self._dataset_type_id, 

825 self._runKeyColumn: run.key, 

826 "ingest_date": timestamp, 

827 } 

828 ) 

829 

830 with self._db.transaction(): 

831 # Insert into the static dataset table. 

832 self._db.insert(self._static.dataset, *rows) 

833 # Update the summary tables for this collection in case this is the 

834 # first time this dataset type or these governor values will be 

835 # inserted there. 

836 self._summaries.update(run, [self._dataset_type_id], summary) 

837 # Combine the generated dataset_id values and data ID fields to 

838 # form rows to be inserted into the tags table. 

839 protoTagsRow = { 

840 "dataset_type_id": self._dataset_type_id, 

841 self._collections.getCollectionForeignKeyName(): run.key, 

842 } 

843 tagsRows = [ 

844 dict(protoTagsRow, dataset_id=row["id"], **dataId.required) 

845 for dataId, row in zip(dataIdList, rows, strict=True) 

846 ] 

847 # Insert those rows into the tags table. 

848 self._db.insert(self._tags, *tagsRows) 

849 

850 for dataId, row in zip(dataIdList, rows, strict=True): 

851 yield DatasetRef( 

852 datasetType=self.datasetType, 

853 dataId=dataId, 

854 id=row["id"], 

855 run=run.name, 

856 ) 

857 
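insert() asks the DatasetIdFactory for an ID per data ID, with idMode selecting unique or reproducible generation. Without reproducing that factory's exact inputs, the general idea can be sketched with the standard uuid module: uuid4 gives a fresh random ID, while uuid5 derived from a namespace and a key string yields the same ID for the same run, dataset type, and data ID. The namespace and key layout below are hypothetical, for illustration only:

import uuid

# Assumed namespace, purely for illustration.
NAMESPACE = uuid.uuid5(uuid.NAMESPACE_DNS, "example.org/datasets")


def unique_id() -> uuid.UUID:
    # A fresh random ID every call.
    return uuid.uuid4()


def deterministic_id(run: str, dataset_type: str, data_id: dict[str, object]) -> uuid.UUID:
    # The same inputs always hash to the same ID.
    key = ",".join([run, dataset_type] + [f"{k}={v}" for k, v in sorted(data_id.items())])
    return uuid.uuid5(NAMESPACE, key)


a = deterministic_id("HSC/runs/demo", "calexp", {"visit": 903334, "detector": 22})
b = deterministic_id("HSC/runs/demo", "calexp", {"visit": 903334, "detector": 22})
assert a == b                      # reproducible for the same run/type/data ID
print(unique_id(), unique_id())    # two different random IDs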

858 def import_( 

859 self, 

860 run: RunRecord, 

861 datasets: Iterable[DatasetRef], 

862 ) -> Iterator[DatasetRef]: 

863 # Docstring inherited from DatasetRecordStorage. 

864 

865 # Current timestamp, type depends on schema version. 

866 if self._use_astropy: 

867 # Astropy `now()` precision should be the same as `datetime.now()`, which 

868 # should mean microsecond. 

869 timestamp = sqlalchemy.sql.literal(astropy.time.Time.now(), type_=ddl.AstropyTimeNsecTai) 

870 else: 

871 timestamp = sqlalchemy.sql.literal(datetime.datetime.now(datetime.UTC)) 

872 

873 # Iterate over data IDs, transforming a possibly-single-pass iterable 

874 # into a list. 

875 dataIds: dict[DatasetId, DataCoordinate] = {} 

876 summary = CollectionSummary() 

877 for dataset in summary.add_datasets_generator(datasets): 

878 dataIds[dataset.id] = dataset.dataId 

879 

880 # We'll insert all new rows into a temporary table 

881 tableSpec = makeTagTableSpec(self.datasetType, type(self._collections), ddl.GUID, constraints=False) 

882 collFkName = self._collections.getCollectionForeignKeyName() 

883 protoTagsRow = { 

884 "dataset_type_id": self._dataset_type_id, 

885 collFkName: run.key, 

886 } 

887 tmpRows = [ 

888 dict(protoTagsRow, dataset_id=dataset_id, **dataId.required) 

889 for dataset_id, dataId in dataIds.items() 

890 ] 

891 with self._db.transaction(for_temp_tables=True), self._db.temporary_table(tableSpec) as tmp_tags: 

892 # store all incoming data in a temporary table 

893 self._db.insert(tmp_tags, *tmpRows) 

894 

895 # There are some checks that we want to make for consistency 

896 # of the new datasets with existing ones. 

897 self._validateImport(tmp_tags, run) 

898 

899 # Before we merge temporary table into dataset/tags we need to 

900 # drop datasets which are already there (and do not conflict). 

901 self._db.deleteWhere( 

902 tmp_tags, 

903 tmp_tags.columns.dataset_id.in_(sqlalchemy.sql.select(self._static.dataset.columns.id)), 

904 ) 

905 

906 # Copy it into dataset table, need to re-label some columns. 

907 self._db.insert( 

908 self._static.dataset, 

909 select=sqlalchemy.sql.select( 

910 tmp_tags.columns.dataset_id.label("id"), 

911 tmp_tags.columns.dataset_type_id, 

912 tmp_tags.columns[collFkName].label(self._runKeyColumn), 

913 timestamp.label("ingest_date"), 

914 ), 

915 ) 

916 

917 # Update the summary tables for this collection in case this 

918 # is the first time this dataset type or these governor values 

919 # will be inserted there. 

920 self._summaries.update(run, [self._dataset_type_id], summary) 

921 

922 # Copy it into tags table. 

923 self._db.insert(self._tags, select=tmp_tags.select()) 

924 

925 # Return refs in the same order as in the input list. 

926 for dataset_id, dataId in dataIds.items(): 

927 yield DatasetRef( 

928 datasetType=self.datasetType, 

929 id=dataset_id, 

930 dataId=dataId, 

931 run=run.name, 

932 ) 

933 
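import_ stages incoming rows in a temporary table, validates them, deletes the ones whose dataset IDs are already registered, and then merges the rest into the static and tags tables with INSERT ... SELECT. A minimal sketch of that staging pattern in plain SQLAlchemy Core, with a toy one-column schema assumed in place of the real dataset/tags tables and the Database wrapper:

import sqlalchemy

metadata = sqlalchemy.MetaData()
toy_dataset = sqlalchemy.Table(
    "toy_dataset", metadata, sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True)
)
tmp = sqlalchemy.Table(
    "tmp_tags", metadata, sqlalchemy.Column("dataset_id", sqlalchemy.Integer), prefixes=["TEMPORARY"]
)
engine = sqlalchemy.create_engine("sqlite:///:memory:")

with engine.begin() as conn:
    metadata.create_all(conn)
    conn.execute(toy_dataset.insert(), [{"id": 1}])                        # pre-existing dataset
    conn.execute(tmp.insert(), [{"dataset_id": 1}, {"dataset_id": 2}])     # incoming (staged) rows
    # Drop incoming rows that are already registered (assumed already validated).
    conn.execute(tmp.delete().where(tmp.c.dataset_id.in_(sqlalchemy.select(toy_dataset.c.id))))
    # Copy what is left into the static table in a single INSERT ... SELECT.
    conn.execute(toy_dataset.insert().from_select(["id"], sqlalchemy.select(tmp.c.dataset_id)))
    count = conn.execute(
        sqlalchemy.select(sqlalchemy.func.count()).select_from(toy_dataset)
    ).scalar_one()
    assert count == 2  # the existing row plus the one genuinely new row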

934 def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> None: 

935 """Validate imported refs against existing datasets. 

936 

937 Parameters 

938 ---------- 

939 tmp_tags : `sqlalchemy.schema.Table` 

940 Temporary table with new datasets and the same schema as the tags 

941 table. 

942 run : `RunRecord` 

943 The record object describing the `~CollectionType.RUN` collection. 

944 

945 Raises 

946 ------ 

947 ConflictingDefinitionError 

948 Raise if new datasets conflict with existing ones. 

949 """ 

950 dataset = self._static.dataset 

951 tags = self._tags 

952 collFkName = self._collections.getCollectionForeignKeyName() 

953 

954 # Check that existing datasets have the same dataset type and 

955 # run. 

956 query = ( 

957 sqlalchemy.sql.select( 

958 dataset.columns.id.label("dataset_id"), 

959 dataset.columns.dataset_type_id.label("dataset_type_id"), 

960 tmp_tags.columns.dataset_type_id.label("new_dataset_type_id"), 

961 dataset.columns[self._runKeyColumn].label("run"), 

962 tmp_tags.columns[collFkName].label("new_run"), 

963 ) 

964 .select_from(dataset.join(tmp_tags, dataset.columns.id == tmp_tags.columns.dataset_id)) 

965 .where( 

966 sqlalchemy.sql.or_( 

967 dataset.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

968 dataset.columns[self._runKeyColumn] != tmp_tags.columns[collFkName], 

969 ) 

970 ) 

971 .limit(1) 

972 ) 

973 with self._db.query(query) as result: 

974 # Only include the first one in the exception message 

975 if (row := result.first()) is not None: 

976 existing_run = self._collections[row.run].name 

977 new_run = self._collections[row.new_run].name 

978 if row.dataset_type_id == self._dataset_type_id: 

979 if row.new_dataset_type_id == self._dataset_type_id: 979 ↛ 985: line 979 didn't jump to line 985, because the condition on line 979 was never false

980 raise ConflictingDefinitionError( 

981 f"Current run {existing_run!r} and new run {new_run!r} do not agree for " 

982 f"dataset {row.dataset_id}." 

983 ) 

984 else: 

985 raise ConflictingDefinitionError( 

986 f"Dataset {row.dataset_id} was provided with type {self.datasetType.name!r} " 

987 f"in run {new_run!r}, but was already defined with type ID {row.dataset_type_id} " 

988 f"in run {run!r}." 

989 ) 

990 else: 

991 raise ConflictingDefinitionError( 

992 f"Dataset {row.dataset_id} was provided with type ID {row.new_dataset_type_id} " 

993 f"in run {new_run!r}, but was already defined with type {self.datasetType.name!r} " 

994 f"in run {run!r}." 

995 ) 

996 

997 # Check that matching dataset in tags table has the same DataId. 

998 query = ( 

999 sqlalchemy.sql.select( 

1000 tags.columns.dataset_id, 

1001 tags.columns.dataset_type_id.label("type_id"), 

1002 tmp_tags.columns.dataset_type_id.label("new_type_id"), 

1003 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

1004 *[ 

1005 tmp_tags.columns[dim].label(f"new_{dim}") 

1006 for dim in self.datasetType.dimensions.required.names 

1007 ], 

1008 ) 

1009 .select_from(tags.join(tmp_tags, tags.columns.dataset_id == tmp_tags.columns.dataset_id)) 

1010 .where( 

1011 sqlalchemy.sql.or_( 

1012 tags.columns.dataset_type_id != tmp_tags.columns.dataset_type_id, 

1013 *[ 

1014 tags.columns[dim] != tmp_tags.columns[dim] 

1015 for dim in self.datasetType.dimensions.required.names 

1016 ], 

1017 ) 

1018 ) 

1019 .limit(1) 

1020 ) 

1021 

1022 with self._db.query(query) as result: 

1023 if (row := result.first()) is not None: 

1024 # Only include the first one in the exception message 

1025 raise ConflictingDefinitionError( 

1026 f"Existing dataset type or dataId do not match new dataset: {row._asdict()}" 

1027 ) 

1028 

1029 # Check that matching run+dataId have the same dataset ID. 

1030 query = ( 

1031 sqlalchemy.sql.select( 

1032 *[tags.columns[dim] for dim in self.datasetType.dimensions.required.names], 

1033 tags.columns.dataset_id, 

1034 tmp_tags.columns.dataset_id.label("new_dataset_id"), 

1035 tags.columns[collFkName], 

1036 tmp_tags.columns[collFkName].label(f"new_{collFkName}"), 

1037 ) 

1038 .select_from( 

1039 tags.join( 

1040 tmp_tags, 

1041 sqlalchemy.sql.and_( 

1042 tags.columns.dataset_type_id == tmp_tags.columns.dataset_type_id, 

1043 tags.columns[collFkName] == tmp_tags.columns[collFkName], 

1044 *[ 

1045 tags.columns[dim] == tmp_tags.columns[dim] 

1046 for dim in self.datasetType.dimensions.required.names 

1047 ], 

1048 ), 

1049 ) 

1050 ) 

1051 .where(tags.columns.dataset_id != tmp_tags.columns.dataset_id) 

1052 .limit(1) 

1053 ) 

1054 with self._db.query(query) as result: 

1055 # only include the first one in the exception message 

1056 if (row := result.first()) is not None: 

1057 data_id = {dim: getattr(row, dim) for dim in self.datasetType.dimensions.required.names} 

1058 existing_collection = self._collections[getattr(row, collFkName)].name 

1059 new_collection = self._collections[getattr(row, f"new_{collFkName}")].name 

1060 raise ConflictingDefinitionError( 

1061 f"Dataset with type {self.datasetType.name!r} and data ID {data_id} " 

1062 f"has ID {row.dataset_id} in existing collection {existing_collection!r} " 

1063 f"but ID {row.new_dataset_id} in new collection {new_collection!r}." 

1064 )
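Each of the three validation queries above follows the same shape: join the temporary table to an existing table on the shared key, keep any row where a column that must agree differs, and LIMIT 1 so a single conflicting example can be reported. A minimal sketch of that shape with toy two-column tables assumed for illustration:

import sqlalchemy

metadata = sqlalchemy.MetaData()
toy_tags = sqlalchemy.Table(
    "toy_tags",
    metadata,
    sqlalchemy.Column("dataset_id", sqlalchemy.Integer),
    sqlalchemy.Column("visit", sqlalchemy.Integer),
)
toy_tmp_tags = sqlalchemy.Table(
    "toy_tmp_tags",
    metadata,
    sqlalchemy.Column("dataset_id", sqlalchemy.Integer),
    sqlalchemy.Column("visit", sqlalchemy.Integer),
)
conflict_query = (
    sqlalchemy.select(
        toy_tags.c.dataset_id,
        toy_tags.c.visit,
        toy_tmp_tags.c.visit.label("new_visit"),
    )
    .select_from(toy_tags.join(toy_tmp_tags, toy_tags.c.dataset_id == toy_tmp_tags.c.dataset_id))
    .where(toy_tags.c.visit != toy_tmp_tags.c.visit)  # real code ORs every column that must agree
    .limit(1)                                         # one example row is enough for the error message
)
print(conflict_query)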