Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py : 86%

from __future__ import annotations

__all__ = ("ByDimensionsDatasetRecordStorage",)

from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    Optional,
    Set,
    TYPE_CHECKING,
)

import sqlalchemy

from lsst.daf.butler import (
    CollectionType,
    DataCoordinate,
    DataCoordinateSet,
    DatasetRef,
    DatasetType,
    SimpleQuery,
    Timespan,
)
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.daf.butler.registry.interfaces import DatasetRecordStorage

if TYPE_CHECKING:  # coverage: 30 ↛ 31 (condition never true at run time)
    from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
    from .tables import CollectionSummaryTables, StaticDatasetTablesTuple
class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
    """Dataset record storage implementation paired with
    `ByDimensionsDatasetRecordStorageManager`; see that class for more
    information.

    Instances of this class should never be constructed directly; use
    `DatasetRecordStorageManager.register` instead.
    """

    def __init__(self, *, datasetType: DatasetType,
                 db: Database,
                 dataset_type_id: int,
                 collections: CollectionManager,
                 static: StaticDatasetTablesTuple,
                 summaries: CollectionSummaryTables,
                 tags: sqlalchemy.schema.Table,
                 calibs: Optional[sqlalchemy.schema.Table]):
        super().__init__(datasetType=datasetType)
        self._dataset_type_id = dataset_type_id
        self._db = db
        self._collections = collections
        self._static = static
        self._summaries = summaries
        self._tags = tags
        self._calibs = calibs
        self._runKeyColumn = collections.getRunForeignKeyName()
    def _ensureSummaries(self, collection: CollectionRecord, governorValues: Dict[str, Set[str]]) -> None:
        """Update the summary tables to associate the given collection with
        ``self.datasetType`` and the given governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        governorValues : `dict` [ `str`, `set` [ `str` ] ]
            Mapping from `GovernorDimension` names to the sets of values they
            may have in the data IDs of the datasets in this collection.
        """
        self._db.ensure(
            self._summaries.datasetType,
            {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): collection.key,
            }
        )
        for governorName, values in governorValues.items():
            self._db.ensure(
                self._summaries.dimensions[governorName],
                *[{
                    self._collections.getCollectionForeignKeyName(): collection.key,
                    governorName: v
                } for v in values],
            )
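
    # Illustrative sketch (not part of the original module): the shape of the
    # ``governorValues`` argument to ``_ensureSummaries`` and the per-governor
    # rows it ensures.  The dimension names ("instrument", "skymap"), their
    # values, and the "collection_id" column name are assumptions made only
    # for illustration.
    @staticmethod
    def _example_summary_rows():
        governorValues: Dict[str, Set[str]] = {
            "instrument": {"HSC"},
            "skymap": {"rings_v1"},
        }
        collectionKey = 7  # hypothetical collection primary key
        rows = []
        for governorName, values in governorValues.items():
            rows.extend({"collection_id": collectionKey, governorName: v} for v in values)
        # e.g. [{"collection_id": 7, "instrument": "HSC"},
        #       {"collection_id": 7, "skymap": "rings_v1"}]
        return rows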
    def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.
        staticRow = {
            "dataset_type_id": self._dataset_type_id,
            self._runKeyColumn: run.key,
        }
        # Iterate over data IDs, transforming a possibly-single-pass iterable
        # into a list, and remembering any governor dimension values we see.
        governorValues: Dict[str, Set[str]] = {
            name: set() for name in self.datasetType.dimensions.governors.names
        }
        dataIdList = []
        for dataId in dataIds:
            dataIdList.append(dataId)
            for governorName, values in governorValues.items():
                values.add(dataId[governorName])  # type: ignore
        with self._db.transaction():
            # Insert into the static dataset table, generating autoincrement
            # dataset_id values.
            datasetIds = self._db.insert(self._static.dataset, *([staticRow]*len(dataIdList)),
                                         returnIds=True)
            assert datasetIds is not None
            # Update the summary tables for this collection in case this is
            # the first time this dataset type or these governor values will
            # be inserted there.
            self._ensureSummaries(run, governorValues)
            # Combine the generated dataset_id values and data ID fields to
            # form rows to be inserted into the tags table.
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): run.key,
            }
            tagsRows = [
                dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
                for dataId, dataset_id in zip(dataIdList, datasetIds)
            ]
            # Insert those rows into the tags table.  This is where we'll
            # get any unique constraint violations.
            self._db.insert(self._tags, *tagsRows)
        for dataId, datasetId in zip(dataIdList, datasetIds):
            yield DatasetRef(
                datasetType=self.datasetType,
                dataId=dataId,
                id=datasetId,
                run=run.name,
            )
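
    # Illustrative sketch (not part of the original module): how ``insert``
    # combines the prototype tags row, a generated dataset_id, and the data ID
    # fields into a single tags-table row.  Column and dimension names here
    # are assumptions for illustration only.
    @staticmethod
    def _example_tags_row():
        protoTagsRow = {"dataset_type_id": 3, "collection_id": 7}
        dataIdByName = {"instrument": "HSC", "detector": 50}
        dataset_id = 101  # hypothetical autoincrement id from the static table
        row = dict(protoTagsRow, dataset_id=dataset_id, **dataIdByName)
        # row == {"dataset_type_id": 3, "collection_id": 7, "dataset_id": 101,
        #         "instrument": "HSC", "detector": 50}
        return row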
    def find(self, collection: CollectionRecord, dataId: DataCoordinate,
             timespan: Optional[Timespan] = None) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.
        assert dataId.graph == self.datasetType.dimensions
        if collection.type is CollectionType.CALIBRATION and timespan is None:  # coverage: 140 ↛ 141 (condition never true)
            raise TypeError(f"Cannot search for dataset in CALIBRATION collection {collection.name} "
                            f"without an input timespan.")
        sql = self.select(collection=collection, dataId=dataId, id=SimpleQuery.Select,
                          run=SimpleQuery.Select, timespan=timespan).combine()
        results = self._db.query(sql)
        row = results.fetchone()
        if row is None:
            return None
        if collection.type is CollectionType.CALIBRATION:
            # For temporal calibration lookups (only!) our invariants do not
            # guarantee that the number of result rows is <= 1.
            # They would if `select` constrained the given timespan to be
            # _contained_ by the validity range in the self._calibs table,
            # instead of simply _overlapping_ it, because we do guarantee that
            # the validity ranges are disjoint for a particular dataset type,
            # collection, and data ID.  But using an overlap test and a check
            # for multiple result rows here allows us to provide a more useful
            # diagnostic, as well as allowing `select` to support more general
            # queries where multiple results are not an error.
            if results.fetchone() is not None:
                raise RuntimeError(
                    f"Multiple matches found for calibration lookup in {collection.name} for "
                    f"{self.datasetType.name} with {dataId} overlapping {timespan}."
                )
        return DatasetRef(
            datasetType=self.datasetType,
            dataId=dataId,
            id=row["id"],
            run=self._collections[row[self._runKeyColumn]].name
        )
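
    # Illustrative sketch (not part of the original module): why a calibration
    # lookup with an *overlap* test can match more than one row even though
    # validity ranges for the same dataset type + collection + data ID are
    # disjoint.  Half-open ``(begin, end)`` tuples stand in for `Timespan`.
    @staticmethod
    def _example_calibration_overlap_ambiguity():
        def overlaps(a, b):
            return a[0] < b[1] and b[0] < a[1]

        validityRanges = [(0, 10), (10, 20)]   # disjoint ranges for one data ID
        queryTimespan = (5, 15)                # straddles the boundary at 10
        matches = [r for r in validityRanges if overlaps(r, queryTimespan)]
        # len(matches) == 2, which ``find`` reports as an error for CALIBRATION
        # collections rather than silently picking one of the rows.
        return len(matches)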
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        # Only delete from common dataset table; ON DELETE foreign key clauses
        # will handle the rest.
        self._db.delete(
            self._static.dataset,
            ["id"],
            *[{"id": dataset.getCheckedId()} for dataset in datasets],
        )
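
    # Illustrative sketch (not part of the original module): why ``delete``
    # only touches the common dataset table.  With ON DELETE CASCADE foreign
    # keys, removing the parent row removes dependent tag rows too.  This uses
    # an in-memory SQLite database with made-up table names.
    @staticmethod
    def _example_on_delete_cascade():
        import sqlite3

        conn = sqlite3.connect(":memory:")
        conn.execute("PRAGMA foreign_keys = ON")
        conn.execute("CREATE TABLE dataset (id INTEGER PRIMARY KEY)")
        conn.execute(
            "CREATE TABLE tags (dataset_id INTEGER "
            "REFERENCES dataset(id) ON DELETE CASCADE)"
        )
        conn.execute("INSERT INTO dataset (id) VALUES (1)")
        conn.execute("INSERT INTO tags (dataset_id) VALUES (1)")
        conn.execute("DELETE FROM dataset WHERE id = 1")
        remaining = conn.execute("SELECT COUNT(*) FROM tags").fetchone()[0]
        conn.close()
        return remaining  # == 0: the tag row was removed by the cascade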
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:  # coverage: 184 ↛ 185 (condition never true)
            raise TypeError(f"Cannot associate into collection '{collection.name}' "
                            f"of type {collection.type.name}; must be TAGGED.")
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues: Dict[str, Set[str]] = {
            name: set() for name in self.datasetType.dimensions.governors.names
        }
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            for governorName, values in governorValues.items():
                values.add(dataset.dataId[governorName])  # type: ignore
            rows.append(row)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values will be
        # inserted there.
        self._ensureSummaries(collection, governorValues)
        # Update the tag table itself.
        self._db.replace(self._tags, *rows)
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:  # coverage: 211 ↛ 212 (condition never true)
            raise TypeError(f"Cannot disassociate from collection '{collection.name}' "
                            f"of type {collection.type.name}; must be TAGGED.")
        rows = [
            {
                "dataset_id": dataset.getCheckedId(),
                self._collections.getCollectionForeignKeyName(): collection.key
            }
            for dataset in datasets
        ]
        self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()],
                        *rows)
    def _buildCalibOverlapQuery(self, collection: CollectionRecord,
                                dataIds: Optional[DataCoordinateSet],
                                timespan: Timespan) -> SimpleQuery:
        assert self._calibs is not None
        # Start by building a SELECT query for any rows that would overlap
        # this one.
        query = SimpleQuery()
        query.join(self._calibs)
        # Add a WHERE clause matching the dataset type and collection.
        query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id)
        query.where.append(
            self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key
        )
        # Add a WHERE clause matching any of the given data IDs.
        if dataIds is not None:
            dataIds.constrain(
                query,
                lambda name: self._calibs.columns[name],  # type: ignore
            )
        # Add a WHERE clause for timespan overlaps.
        TimespanReprClass = self._db.getTimespanRepresentation()
        query.where.append(
            TimespanReprClass.fromSelectable(self._calibs).overlaps(TimespanReprClass.fromLiteral(timespan))
        )
        return query
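
    # Illustrative sketch (not part of the original module): a plain-Python
    # model of the filter that ``_buildCalibOverlapQuery`` expresses in SQL.
    # Rows are dicts with hypothetical column names; timespans are half-open
    # ``(begin, end)`` tuples rather than the real `Timespan` representation.
    @staticmethod
    def _example_calib_overlap_filter(rows, dataset_type_id, collection_key, dataIds, timespan):
        def overlaps(a, b):
            return a[0] < b[1] and b[0] < a[1]

        return [
            row for row in rows
            if row["dataset_type_id"] == dataset_type_id
            and row["collection_id"] == collection_key
            and (dataIds is None or row["dataId"] in dataIds)
            and overlaps(row["timespan"], timespan)
        ]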
    def certify(self, collection: CollectionRecord, datasets: Iterable[DatasetRef],
                timespan: Timespan) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:  # coverage: 253 ↛ 254 (condition never true)
            raise TypeError(f"Cannot certify datasets of type {self.datasetType.name}, for which "
                            f"DatasetType.isCalibration() is False.")
        if collection.type is not CollectionType.CALIBRATION:  # coverage: 256 ↛ 257 (condition never true)
            raise TypeError(f"Cannot certify into collection '{collection.name}' "
                            f"of type {collection.type.name}; must be CALIBRATION.")
        TimespanReprClass = self._db.getTimespanRepresentation()
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues: Dict[str, Set[str]] = {
            name: set() for name in self.datasetType.dimensions.governors.names
        }
        dataIds: Optional[Set[DataCoordinate]] = (
            set() if not TimespanReprClass.hasExclusionConstraint() else None
        )
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            TimespanReprClass.update(timespan, result=row)
            for governorName, values in governorValues.items():
                values.add(dataset.dataId[governorName])  # type: ignore
            rows.append(row)
            if dataIds is not None:  # coverage: 279 ↛ 271 (condition never false)
                dataIds.add(dataset.dataId)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values will be
        # inserted there.
        self._ensureSummaries(collection, governorValues)
        # Update the association table itself.
        if TimespanReprClass.hasExclusionConstraint():  # coverage: 286 ↛ 289 (condition never true)
            # Rely on the database constraint to enforce invariants; we just
            # reraise the exception for consistency across DB engines.
            try:
                self._db.insert(self._calibs, *rows)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Validity range conflict certifying datasets of type {self.datasetType.name} "
                    f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
                ) from err
        else:
            # Have to implement the exclusion constraint ourselves.
            # Start by building a SELECT query for any rows that would overlap
            # this one.
            query = self._buildCalibOverlapQuery(
                collection,
                DataCoordinateSet(dataIds, graph=self.datasetType.dimensions),  # type: ignore
                timespan
            )
            query.columns.append(sqlalchemy.sql.func.count())
            sql = query.combine()
            # Acquire a table lock to ensure there are no concurrent writes
            # that could invalidate our check before we finish the inserts.
            # We use a SAVEPOINT in case there is an outer transaction that a
            # failure here should not roll back.
            with self._db.transaction(lock=[self._calibs], savepoint=True):
                # Run the check SELECT query.
                conflicting = self._db.query(sql).scalar()
                if conflicting > 0:
                    raise ConflictingDefinitionError(
                        f"{conflicting} validity range conflicts certifying datasets of type "
                        f"{self.datasetType.name} into {collection.name} for range "
                        f"[{timespan.begin}, {timespan.end})."
                    )
                # Proceed with the insert.
                self._db.insert(self._calibs, *rows)
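
    # Illustrative sketch (not part of the original module): the
    # check-then-insert pattern ``certify`` falls back to when the database
    # has no exclusion constraints.  In the real code the check and the insert
    # run inside a single locked transaction; here an in-memory list with
    # half-open ``(begin, end)`` tuples stands in for the calibs table.
    @staticmethod
    def _example_manual_exclusion(existingRanges, newRange):
        def overlaps(a, b):
            return a[0] < b[1] and b[0] < a[1]

        conflicting = sum(1 for r in existingRanges if overlaps(r, newRange))
        if conflicting > 0:
            raise ConflictingDefinitionError(
                f"{conflicting} validity range conflicts for range "
                f"[{newRange[0]}, {newRange[1]})."
            )
        existingRanges.append(newRange)
        return existingRanges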
    def decertify(self, collection: CollectionRecord, timespan: Timespan, *,
                  dataIds: Optional[Iterable[DataCoordinate]] = None) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:  # coverage: 326 ↛ 327 (condition never true)
            raise TypeError(f"Cannot decertify datasets of type {self.datasetType.name}, for which "
                            f"DatasetType.isCalibration() is False.")
        if collection.type is not CollectionType.CALIBRATION:  # coverage: 329 ↛ 330 (condition never true)
            raise TypeError(f"Cannot decertify from collection '{collection.name}' "
                            f"of type {collection.type.name}; must be CALIBRATION.")
        TimespanReprClass = self._db.getTimespanRepresentation()
        # Construct a SELECT query to find all rows that overlap our inputs.
        dataIdSet: Optional[DataCoordinateSet]
        if dataIds is not None:
            dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions)
        else:
            dataIdSet = None
        query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan)
        query.columns.extend(self._calibs.columns)
        sql = query.combine()
        # Set up collections to populate with the rows we'll want to modify.
        # The insert rows will have the same values for collection and
        # dataset type.
        protoInsertRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rowsToDelete = []
        rowsToInsert = []
        # Acquire a table lock to ensure there are no concurrent writes
        # between the SELECT and the DELETE and INSERT queries based on it.
        with self._db.transaction(lock=[self._calibs], savepoint=True):
            for row in self._db.query(sql):
                rowsToDelete.append({"id": row["id"]})
                # Construct the insert row(s) by copying the prototype row,
                # then adding the dimension column values, then adding what's
                # left of the timespan from that row after we subtract the
                # given timespan.
                newInsertRow = protoInsertRow.copy()
                newInsertRow["dataset_id"] = row["dataset_id"]
                for name in self.datasetType.dimensions.required.names:
                    newInsertRow[name] = row[name]
                rowTimespan = TimespanReprClass.extract(row)
                assert rowTimespan is not None, "Field should have a NOT NULL constraint."
                for diffTimespan in rowTimespan.difference(timespan):
                    rowsToInsert.append(TimespanReprClass.update(diffTimespan, result=newInsertRow.copy()))
            # Run the DELETE and INSERT queries.
            self._db.delete(self._calibs, ["id"], *rowsToDelete)
            self._db.insert(self._calibs, *rowsToInsert)
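
    # Illustrative sketch (not part of the original module): the timespan
    # arithmetic ``decertify`` relies on.  Subtracting the decertified range
    # from an existing validity range can leave zero, one, or two pieces, each
    # of which becomes a new row.  Half-open ``(begin, end)`` tuples stand in
    # for `Timespan` here.
    @staticmethod
    def _example_timespan_difference(existing, removed):
        pieces = []
        if existing[0] < removed[0]:
            pieces.append((existing[0], min(existing[1], removed[0])))
        if removed[1] < existing[1]:
            pieces.append((max(existing[0], removed[1]), existing[1]))
        # _example_timespan_difference((0, 20), (5, 10)) == [(0, 5), (10, 20)]
        # _example_timespan_difference((0, 20), (15, 30)) == [(0, 15)]
        return pieces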
    def select(self, collection: CollectionRecord,
               dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
               id: SimpleQuery.Select.Or[Optional[int]] = SimpleQuery.Select,
               run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
               timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
               ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
               ) -> SimpleQuery:
        # Docstring inherited from DatasetRecordStorage.
        assert collection.type is not CollectionType.CHAINED
        query = SimpleQuery()
        # We always include the _static.dataset table, and we can always get
        # the id and run fields from that; passing them as kwargs here tells
        # SimpleQuery to handle them whether they're constraints or results.
        # We always constrain the dataset_type_id here as well.
        static_kwargs = {self._runKeyColumn: run}
        if ingestDate is not None:
            static_kwargs["ingest_date"] = SimpleQuery.Select
        query.join(
            self._static.dataset,
            id=id,
            dataset_type_id=self._dataset_type_id,
            **static_kwargs
        )
        # If and only if the collection is a RUN, we constrain it in the
        # static table (and also in the tags or calibs table below).
        if collection.type is CollectionType.RUN:
            query.where.append(self._static.dataset.columns[self._runKeyColumn]
                               == collection.key)
        # We get or constrain the data ID from the tags/calibs table, but
        # that's multiple columns, not one, so we need to transform the one
        # Select.Or argument into a dictionary of them.
        kwargs: Dict[str, Any]
        if dataId is SimpleQuery.Select:
            kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required}
        else:
            kwargs = dict(dataId.byName())
        # We always constrain (never retrieve) the collection from the tags
        # table.
        kwargs[self._collections.getCollectionForeignKeyName()] = collection.key
        # Constrain the ingest time.
        if isinstance(ingestDate, Timespan):  # coverage: 412 ↛ 415 (condition never true)
            # Timespan bounds are astropy Time (usually TAI) while ingest_date
            # is a TIMESTAMP column, so convert the values to Python datetime
            # for sqlalchemy.
            if ingestDate.isEmpty():
                raise RuntimeError("Empty timespan constraint provided for ingest_date.")
            if ingestDate.begin is not None:
                begin = ingestDate.begin.utc.datetime  # type: ignore
                query.where.append(self._static.dataset.ingest_date >= begin)
            if ingestDate.end is not None:
                end = ingestDate.end.utc.datetime  # type: ignore
                query.where.append(self._static.dataset.ingest_date < end)
        # And now we finally join in the tags or calibs table.
        if collection.type is CollectionType.CALIBRATION:
            assert self._calibs is not None, \
                "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
            TimespanReprClass = self._db.getTimespanRepresentation()
            # Add the timespan column(s) to the result columns, or constrain
            # the timespan via an overlap condition.
            if timespan is SimpleQuery.Select:
                kwargs.update({k: SimpleQuery.Select for k in TimespanReprClass.getFieldNames()})
            elif timespan is not None:  # coverage: 432 ↛ 438 (condition never false)
                query.where.append(
                    TimespanReprClass.fromSelectable(self._calibs).overlaps(
                        TimespanReprClass.fromLiteral(timespan)
                    )
                )
            query.join(
                self._calibs,
                onclause=(self._static.dataset.columns.id == self._calibs.columns.dataset_id),
                **kwargs
            )
        else:
            query.join(
                self._tags,
                onclause=(self._static.dataset.columns.id == self._tags.columns.dataset_id),
                **kwargs
            )
        return query
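
    # Illustrative sketch (not part of the original module): the
    # ``SimpleQuery.Select.Or`` convention used throughout ``select``.  Each
    # keyword argument is either the ``Select`` sentinel (return that column),
    # a concrete value (constrain that column), or `None` (ignore it).  The
    # sentinel object and column names below are stand-ins, not the real API.
    @staticmethod
    def _example_select_or():
        SELECT = object()  # stand-in for the SimpleQuery.Select sentinel
        kwargs = {"id": SELECT, "run": SELECT, "dataset_type_id": 3, "timespan": None}
        columnsToReturn = [k for k, v in kwargs.items() if v is SELECT]
        constraints = {k: v for k, v in kwargs.items() if v is not SELECT and v is not None}
        # columnsToReturn == ["id", "run"]; constraints == {"dataset_type_id": 3}
        return columnsToReturn, constraints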
    def getDataId(self, id: int) -> DataCoordinate:
        # Docstring inherited from DatasetRecordStorage.
        # This query could return multiple rows (one for each tagged
        # collection the dataset is in, plus one for its run collection),
        # and we don't care which of those we get.
        sql = self._tags.select().where(
            sqlalchemy.sql.and_(
                self._tags.columns.dataset_id == id,
                self._tags.columns.dataset_type_id == self._dataset_type_id
            )
        ).limit(1)
        row = self._db.query(sql).fetchone()
        assert row is not None, "Should be guaranteed by caller and foreign key constraints."
        return DataCoordinate.standardize(
            {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
            graph=self.datasetType.dimensions
        )
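
# Illustrative sketch (not part of the original module): how ``getDataId``
# turns a tags-table row into a data ID mapping by picking out the required
# dimension columns.  The row contents and dimension names are assumptions;
# the real code passes the resulting mapping to `DataCoordinate.standardize`.
def _example_data_id_from_row():
    requiredDimensions = ("instrument", "detector")
    row = {"dataset_id": 101, "dataset_type_id": 3, "collection_id": 7,
           "instrument": "HSC", "detector": 50}
    return {name: row[name] for name in requiredDimensions}
    # == {"instrument": "HSC", "detector": 50}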