Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py: 88%

from __future__ import annotations

__all__ = ("ByDimensionsDatasetRecordStorage",)

from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    Optional,
    Set,
    TYPE_CHECKING,
)

import sqlalchemy

from lsst.daf.butler import (
    CollectionType,
    DataCoordinate,
    DataCoordinateSet,
    DatasetRef,
    DatasetType,
    SimpleQuery,
    Timespan,
)
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.daf.butler.registry.interfaces import DatasetRecordStorage

if TYPE_CHECKING:
    from ...interfaces import CollectionManager, CollectionRecord, Database, RunRecord
    from .tables import CollectionSummaryTables, StaticDatasetTablesTuple

class ByDimensionsDatasetRecordStorage(DatasetRecordStorage):
    """Dataset record storage implementation paired with
    `ByDimensionsDatasetRecordStorageManager`; see that class for more
    information.

    Instances of this class should never be constructed directly; use
    `DatasetRecordStorageManager.register` instead.
    """
    def __init__(self, *, datasetType: DatasetType,
                 db: Database,
                 dataset_type_id: int,
                 collections: CollectionManager,
                 static: StaticDatasetTablesTuple,
                 summaries: CollectionSummaryTables,
                 tags: sqlalchemy.schema.Table,
                 calibs: Optional[sqlalchemy.schema.Table]):
        super().__init__(datasetType=datasetType)
        self._dataset_type_id = dataset_type_id
        self._db = db
        self._collections = collections
        self._static = static
        self._summaries = summaries
        self._tags = tags
        self._calibs = calibs
        self._runKeyColumn = collections.getRunForeignKeyName()

    def _ensureSummaries(self, collection: CollectionRecord, governorValues: Dict[str, Set[str]]) -> None:
        """Update the summary tables to associate the given collection with
        ``self.datasetType`` and the given governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        governorValues : `dict` [ `str`, `set` [ `str` ] ]
            Mapping from `GovernorDimension` names to sets of values they may
            have in the data IDs of the datasets in this collection.
        """
        self._db.ensure(
            self._summaries.datasetType,
            {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): collection.key,
            }
        )
        for governorName, values in governorValues.items():
            self._db.ensure(
                self._summaries.dimensions[governorName],
                *[{
                    self._collections.getCollectionForeignKeyName(): collection.key,
                    governorName: v
                } for v in values],
            )
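
    # A sketch (with hypothetical values) of the ``governorValues`` argument for
    # a dataset type whose dimensions include the ``instrument`` governor:
    #
    #     {"instrument": {"HSC", "DECam"}}
    #
    # This produces one ``ensure`` call on the per-dataset-type summary table and
    # one batched ``ensure`` call on the ``instrument`` summary table, with a row
    # for each value in the set.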

    def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate]) -> Iterator[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.
        staticRow = {
            "dataset_type_id": self._dataset_type_id,
            self._runKeyColumn: run.key,
        }
        # Iterate over data IDs, transforming a possibly-single-pass iterable
        # into a list, and remembering any governor dimension values we see.
        governorValues: Dict[str, Set[str]] = {
            name: set() for name in self.datasetType.dimensions.governors.names
        }
        dataIdList = []
        for dataId in dataIds:
            dataIdList.append(dataId)
            for governorName, values in governorValues.items():
                values.add(dataId[governorName])  # type: ignore
        with self._db.transaction():
            # Insert into the static dataset table, generating autoincrement
            # dataset_id values.
            datasetIds = self._db.insert(self._static.dataset, *([staticRow]*len(dataIdList)),
                                         returnIds=True)
            assert datasetIds is not None
            # Update the summary tables for this collection in case this is
            # the first time this dataset type or these governor values are
            # inserted there.
            self._ensureSummaries(run, governorValues)
            # Combine the generated dataset_id values and data ID fields to
            # form rows to be inserted into the tags table.
            protoTagsRow = {
                "dataset_type_id": self._dataset_type_id,
                self._collections.getCollectionForeignKeyName(): run.key,
            }
            tagsRows = [
                dict(protoTagsRow, dataset_id=dataset_id, **dataId.byName())
                for dataId, dataset_id in zip(dataIdList, datasetIds)
            ]
            # Insert those rows into the tags table.  This is where we'll
            # get any unique constraint violations.
            self._db.insert(self._tags, *tagsRows)
        for dataId, datasetId in zip(dataIdList, datasetIds):
            yield DatasetRef(
                datasetType=self.datasetType,
                dataId=dataId,
                id=datasetId,
                run=run.name,
            )
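
    # A minimal usage sketch for ``insert`` (hypothetical ``storage``,
    # ``run_record``, and ``data_ids``; this is typically driven by higher-level
    # Registry code rather than called directly).  Because ``insert`` is a
    # generator, the database work only happens once the result is iterated:
    #
    #     refs = list(storage.insert(run_record, data_ids))
    #     # each ref is resolved: ref.id is the new autoincrement dataset_id
    #     # and ref.run is run_record.name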

    def find(self, collection: CollectionRecord, dataId: DataCoordinate,
             timespan: Optional[Timespan] = None) -> Optional[DatasetRef]:
        # Docstring inherited from DatasetRecordStorage.
        assert dataId.graph == self.datasetType.dimensions
        if collection.type is CollectionType.CALIBRATION and timespan is None:
            raise TypeError(f"Cannot search for dataset in CALIBRATION collection {collection.name} "
                            f"without an input timespan.")
        sql = self.select(collection=collection, dataId=dataId, id=SimpleQuery.Select,
                          run=SimpleQuery.Select, timespan=timespan).combine()
        results = self._db.query(sql)
        row = results.fetchone()
        if row is None:
            return None
        if collection.type is CollectionType.CALIBRATION:
            # For temporal calibration lookups (only!) our invariants do not
            # guarantee that the number of result rows is <= 1.
            # They would if `select` constrained the given timespan to be
            # _contained_ by the validity range in the self._calibs table,
            # instead of simply _overlapping_ it, because we do guarantee that
            # the validity ranges are disjoint for a particular dataset type,
            # collection, and data ID.  But using an overlap test and a check
            # for multiple result rows here allows us to provide a more useful
            # diagnostic, as well as allowing `select` to support more general
            # queries where multiple results are not an error.
            if results.fetchone() is not None:
                raise RuntimeError(
                    f"Multiple matches found for calibration lookup in {collection.name} for "
                    f"{self.datasetType.name} with {dataId} overlapping {timespan}. "
                )
        return DatasetRef(
            datasetType=self.datasetType,
            dataId=dataId,
            id=row["id"],
            run=self._collections[row[self._runKeyColumn]].name
        )
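
    # A sketch of a temporal calibration lookup via ``find`` (hypothetical
    # names):
    #
    #     ref = storage.find(calib_collection, data_id,
    #                        timespan=Timespan(begin=t_begin, end=t_end))
    #
    # Because ``select`` uses an *overlap* test rather than containment, a search
    # timespan that straddles two adjacent validity ranges matches both rows, and
    # the second ``fetchone()`` above turns that into the RuntimeError instead of
    # silently returning an arbitrary match.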

    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        # Only delete from common dataset table; ON DELETE foreign key clauses
        # will handle the rest.
        self._db.delete(
            self._static.dataset,
            ["id"],
            *[{"id": dataset.getCheckedId()} for dataset in datasets],
        )

    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:
            raise TypeError(f"Cannot associate into collection '{collection.name}' "
                            f"of type {collection.type.name}; must be TAGGED.")
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues: Dict[str, Set[str]] = {
            name: set() for name in self.datasetType.dimensions.governors.names
        }
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            for governorName, values in governorValues.items():
                values.add(dataset.dataId[governorName])  # type: ignore
            rows.append(row)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values are inserted
        # there.
        self._ensureSummaries(collection, governorValues)
        # Update the tag table itself.
        self._db.replace(self._tags, *rows)

    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if collection.type is not CollectionType.TAGGED:
            raise TypeError(f"Cannot disassociate from collection '{collection.name}' "
                            f"of type {collection.type.name}; must be TAGGED.")
        rows = [
            {
                "dataset_id": dataset.getCheckedId(),
                self._collections.getCollectionForeignKeyName(): collection.key
            }
            for dataset in datasets
        ]
        self._db.delete(self._tags, ["dataset_id", self._collections.getCollectionForeignKeyName()],
                        *rows)

    def _buildCalibOverlapQuery(self, collection: CollectionRecord,
                                dataIds: Optional[DataCoordinateSet],
                                timespan: Timespan) -> SimpleQuery:
        assert self._calibs is not None
        # Start by building a SELECT query for any rows that would overlap
        # this one.
        query = SimpleQuery()
        query.join(self._calibs)
        # Add a WHERE clause matching the dataset type and collection.
        query.where.append(self._calibs.columns.dataset_type_id == self._dataset_type_id)
        query.where.append(
            self._calibs.columns[self._collections.getCollectionForeignKeyName()] == collection.key
        )
        # Add a WHERE clause matching any of the given data IDs.
        if dataIds is not None:
            dataIds.constrain(
                query,
                lambda name: self._calibs.columns[name],  # type: ignore
            )
        # Add a WHERE clause for timespan overlaps.
        tsRepr = self._db.getTimespanRepresentation()
        query.where.append(tsRepr.fromSelectable(self._calibs).overlaps(timespan))
        return query
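
    # Schematically, the query built above corresponds to SQL of the form
    # (actual column names depend on the configured collection manager and
    # timespan representation):
    #
    #     SELECT ... FROM <calibs>
    #     WHERE dataset_type_id = :dataset_type_id
    #       AND <collection foreign key> = :collection_key
    #       AND <data ID columns match one of the given data IDs>
    #       AND <validity range overlaps the given timespan>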

    def certify(self, collection: CollectionRecord, datasets: Iterable[DatasetRef],
                timespan: Timespan) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:
            raise TypeError(f"Cannot certify datasets of type {self.datasetType.name}, for which "
                            f"DatasetType.isCalibration() is False.")
        if collection.type is not CollectionType.CALIBRATION:
            raise TypeError(f"Cannot certify into collection '{collection.name}' "
                            f"of type {collection.type.name}; must be CALIBRATION.")
        tsRepr = self._db.getTimespanRepresentation()
        protoRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rows = []
        governorValues: Dict[str, Set[str]] = {
            name: set() for name in self.datasetType.dimensions.governors.names
        }
        dataIds: Optional[Set[DataCoordinate]] = set() if not tsRepr.hasExclusionConstraint() else None
        for dataset in datasets:
            row = dict(protoRow, dataset_id=dataset.getCheckedId())
            for dimension, value in dataset.dataId.items():
                row[dimension.name] = value
            tsRepr.update(timespan, result=row)
            for governorName, values in governorValues.items():
                values.add(dataset.dataId[governorName])  # type: ignore
            rows.append(row)
            if dataIds is not None:
                dataIds.add(dataset.dataId)
        # Update the summary tables for this collection in case this is the
        # first time this dataset type or these governor values are inserted
        # there.
        self._ensureSummaries(collection, governorValues)
        # Update the association table itself.
        if tsRepr.hasExclusionConstraint():
            # Rely on the database constraint to enforce invariants; we just
            # reraise the exception for consistency across DB engines.
            try:
                self._db.insert(self._calibs, *rows)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Validity range conflict certifying datasets of type {self.datasetType.name} "
                    f"into {collection.name} for range [{timespan.begin}, {timespan.end})."
                ) from err
        else:
            # Have to implement the exclusion constraint ourselves.
            # Start by building a SELECT query for any rows that would overlap
            # this one.
            query = self._buildCalibOverlapQuery(
                collection,
                DataCoordinateSet(dataIds, graph=self.datasetType.dimensions),  # type: ignore
                timespan
            )
            query.columns.append(sqlalchemy.sql.func.count())
            sql = query.combine()
            # Acquire a table lock to ensure there are no concurrent writes
            # that could invalidate our checking before we finish the inserts.
            # We use a SAVEPOINT in case there is an outer transaction that a
            # failure here should not roll back.
            with self._db.transaction(lock=[self._calibs], savepoint=True):
                # Run the check SELECT query.
                conflicting = self._db.query(sql).scalar()
                if conflicting > 0:
                    raise ConflictingDefinitionError(
                        f"{conflicting} validity range conflicts certifying datasets of type "
                        f"{self.datasetType.name} into {collection.name} for range "
                        f"[{timespan.begin}, {timespan.end})."
                    )
                # Proceed with the insert.
                self._db.insert(self._calibs, *rows)
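
    # A usage sketch for ``certify`` (hypothetical names):
    #
    #     storage.certify(calib_collection, refs, Timespan(begin=t0, end=t1))
    #
    # On databases with a native range exclusion constraint the insert alone
    # enforces non-overlapping validity ranges; otherwise the overlap-count
    # query plus the table lock above provides the same guarantee, raising
    # ConflictingDefinitionError before any conflicting row is written.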

    def decertify(self, collection: CollectionRecord, timespan: Timespan, *,
                  dataIds: Optional[Iterable[DataCoordinate]] = None) -> None:
        # Docstring inherited from DatasetRecordStorage.
        if self._calibs is None:
            raise TypeError(f"Cannot decertify datasets of type {self.datasetType.name}, for which "
                            f"DatasetType.isCalibration() is False.")
        if collection.type is not CollectionType.CALIBRATION:
            raise TypeError(f"Cannot decertify from collection '{collection.name}' "
                            f"of type {collection.type.name}; must be CALIBRATION.")
        tsRepr = self._db.getTimespanRepresentation()
        # Construct a SELECT query to find all rows that overlap our inputs.
        dataIdSet: Optional[DataCoordinateSet]
        if dataIds is not None:
            dataIdSet = DataCoordinateSet(set(dataIds), graph=self.datasetType.dimensions)
        else:
            dataIdSet = None
        query = self._buildCalibOverlapQuery(collection, dataIdSet, timespan)
        query.columns.extend(self._calibs.columns)
        sql = query.combine()
        # Set up collections to populate with the rows we'll want to modify.
        # The insert rows will have the same values for collection and
        # dataset type.
        protoInsertRow = {
            self._collections.getCollectionForeignKeyName(): collection.key,
            "dataset_type_id": self._dataset_type_id,
        }
        rowsToDelete = []
        rowsToInsert = []
        # Acquire a table lock to ensure there are no concurrent writes
        # between the SELECT and the DELETE and INSERT queries based on it.
        with self._db.transaction(lock=[self._calibs], savepoint=True):
            for row in self._db.query(sql):
                rowsToDelete.append({"id": row["id"]})
                # Construct the insert row(s) by copying the prototype row,
                # then adding the dimension column values, then adding what's
                # left of the timespan from that row after we subtract the
                # given timespan.
                newInsertRow = protoInsertRow.copy()
                newInsertRow["dataset_id"] = row["dataset_id"]
                for name in self.datasetType.dimensions.required.names:
                    newInsertRow[name] = row[name]
                rowTimespan = tsRepr.extract(row)
                assert rowTimespan is not None, "Field should have a NOT NULL constraint."
                for diffTimespan in rowTimespan.difference(timespan):
                    rowsToInsert.append(tsRepr.update(diffTimespan, result=newInsertRow.copy()))
            # Run the DELETE and INSERT queries.
            self._db.delete(self._calibs, ["id"], *rowsToDelete)
            self._db.insert(self._calibs, *rowsToInsert)
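
    # A worked sketch of the splitting logic above (hypothetical dates): if a
    # dataset is certified for [2020-01-01, 2021-01-01) and we decertify
    # [2020-03-01, 2020-06-01), the overlapping row is deleted and replaced by
    # two rows covering the remainders, [2020-01-01, 2020-03-01) and
    # [2020-06-01, 2021-01-01), which is what ``Timespan.difference`` yields
    # for that pair:
    #
    #     storage.decertify(calib_collection, Timespan(begin=mar, end=jun),
    #                       dataIds=[data_id])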

    def select(self, collection: CollectionRecord,
               dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
               id: SimpleQuery.Select.Or[Optional[int]] = SimpleQuery.Select,
               run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
               timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
               ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
               ) -> SimpleQuery:
        # Docstring inherited from DatasetRecordStorage.
        assert collection.type is not CollectionType.CHAINED
        query = SimpleQuery()
        # We always include the _static.dataset table, and we can always get
        # the id and run fields from that; passing them as kwargs here tells
        # SimpleQuery to handle them whether they're constraints or results.
        # We always constrain the dataset_type_id here as well.
        static_kwargs = {self._runKeyColumn: run}
        if ingestDate is not None:
            static_kwargs["ingest_date"] = SimpleQuery.Select
        query.join(
            self._static.dataset,
            id=id,
            dataset_type_id=self._dataset_type_id,
            **static_kwargs
        )
        # If and only if the collection is a RUN, we constrain it in the
        # static table (and also in the tags or calibs table below).
        if collection.type is CollectionType.RUN:
            query.where.append(self._static.dataset.columns[self._runKeyColumn]
                               == collection.key)
        # We get or constrain the data ID from the tags/calibs table, but
        # that's multiple columns, not one, so we need to transform the one
        # Select.Or argument into a dictionary of them.
        kwargs: Dict[str, Any]
        if dataId is SimpleQuery.Select:
            kwargs = {dim.name: SimpleQuery.Select for dim in self.datasetType.dimensions.required}
        else:
            kwargs = dict(dataId.byName())
        # We always constrain (never retrieve) the collection from the tags
        # table.
        kwargs[self._collections.getCollectionForeignKeyName()] = collection.key
        # Constrain the ingest time.
        if isinstance(ingestDate, Timespan):
            # Timespan bounds are astropy Time values (usually in TAI) while
            # ingest_date is a TIMESTAMP column, so convert the values to
            # Python datetime for sqlalchemy.
            if ingestDate.begin is not None:
                begin = ingestDate.begin.utc.datetime
                query.where.append(self._static.dataset.ingest_date >= begin)
            if ingestDate.end is not None:
                end = ingestDate.end.utc.datetime
                query.where.append(self._static.dataset.ingest_date < end)
        # And now we finally join in the tags or calibs table.
        if collection.type is CollectionType.CALIBRATION:
            assert self._calibs is not None, \
                "DatasetTypes with isCalibration() == False can never be found in a CALIBRATION collection."
            tsRepr = self._db.getTimespanRepresentation()
            # Add the timespan column(s) to the result columns, or constrain
            # the timespan via an overlap condition.
            if timespan is SimpleQuery.Select:
                kwargs.update({k: SimpleQuery.Select for k in tsRepr.getFieldNames()})
            elif timespan is not None:
                query.where.append(tsRepr.fromSelectable(self._calibs).overlaps(timespan))
            query.join(
                self._calibs,
                onclause=(self._static.dataset.columns.id == self._calibs.columns.dataset_id),
                **kwargs
            )
        else:
            query.join(
                self._tags,
                onclause=(self._static.dataset.columns.id == self._tags.columns.dataset_id),
                **kwargs
            )
        return query
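
    # A usage sketch mirroring how ``find`` consumes this method: request the
    # ``id`` and ``run`` columns while constraining the data ID (and, for
    # CALIBRATION collections, the timespan), then combine and execute:
    #
    #     sql = storage.select(collection=record, dataId=data_id,
    #                          id=SimpleQuery.Select, run=SimpleQuery.Select,
    #                          timespan=timespan).combine()
    #     row = db.query(sql).fetchone()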

    def getDataId(self, id: int) -> DataCoordinate:
        # Docstring inherited from DatasetRecordStorage.
        # This query could return multiple rows (one for each tagged collection
        # the dataset is in, plus one for its run collection), and we don't
        # care which of those we get.
        sql = self._tags.select().where(
            sqlalchemy.sql.and_(
                self._tags.columns.dataset_id == id,
                self._tags.columns.dataset_type_id == self._dataset_type_id
            )
        ).limit(1)
        row = self._db.query(sql).fetchone()
        assert row is not None, "Should be guaranteed by caller and foreign key constraints."
        return DataCoordinate.standardize(
            {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
            graph=self.datasetType.dimensions
        )