Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 67%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdGenEnum")

from abc import ABC, abstractmethod
import enum
from typing import (
    Any,
    Iterable,
    Iterator,
    Optional,
    Tuple,
    TYPE_CHECKING,
)

from ...core import (
    DataCoordinate,
    DatasetId,
    DatasetRef,
    DatasetType,
    ddl,
    SimpleQuery,
    Timespan,
)
from ._versioning import VersionedExtension

if TYPE_CHECKING:
    from ..summaries import CollectionSummary
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager
    from ._collections import CollectionManager, CollectionRecord, RunRecord


class DatasetIdGenEnum(enum.Enum):
    """This enum is used to specify dataset ID generation options for the
    ``insert()`` method.
    """

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g.
    one auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination
    of dataset type and data ID.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination
    of dataset type, data ID, and run collection name.
    """
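
# A minimal illustration of how the deterministic modes above differ; this
# is a sketch, not the actual implementation, and the UUID namespace used
# here is hypothetical (the real scheme is defined by the concrete manager):
#
#     import uuid
#
#     _EXAMPLE_NAMESPACE = uuid.uuid5(uuid.NAMESPACE_URL, "daf_butler.example")
#
#     def _deterministic_id(typeName: str, dataIdStr: str, run: str = "") -> uuid.UUID:
#         # DATAID_TYPE hashes type name + data ID; DATAID_TYPE_RUN also
#         # mixes in the run collection name.
#         return uuid.uuid5(_EXAMPLE_NAMESPACE, f"{typeName};{dataIdStr};{run}")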


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """
    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate],
               idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added. The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with a new unique ID.
            With a non-`UNIQUE` mode the ID is computed deterministically
            from some combination of dataset type, data ID, and run
            collection name; if that ID is already in the database, no new
            record is inserted.

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
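
        Examples
        --------
        A hypothetical sketch, not part of the interface contract:
        ``storage`` stands for a concrete implementation of this class and
        ``run`` for an existing `RunRecord`::

            refs = list(storage.insert(run, dataIds))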
        """
        raise NotImplementedError()

    @abstractmethod
    def import_(self, run: RunRecord, datasets: Iterable[DatasetRef],
                idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
                reuseIds: bool = False) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted. Datasets may specify an ``id``
            attribute, which will be used for the inserted datasets. All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            the type of the dataset IDs does not match the type supported by
            this class, the IDs will be ignored and new IDs will be
            generated by the backend.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with a new unique ID.
            With a non-`UNIQUE` mode the ID is computed deterministically
            from some combination of dataset type, data ID, and run
            collection name; if that ID is already in the database, no new
            record is inserted.
        reuseIds : `bool`, optional
            If `True`, force re-use of imported dataset IDs for integer IDs,
            which are normally generated as auto-incremented; an exception
            will be raised if the imported IDs clash with existing ones.
            This option has no effect on globally unique IDs, which are
            always re-used (or generated if integer IDs are being imported).

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the given datasets are
        expected to be identical across all of them, but this is not checked;
        it should be enforced by higher-level registry code. This method does
        not need to use those attributes; only ``dataId`` and ``id`` are
        relevant.
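
        Examples
        --------
        A hypothetical sketch of importing datasets from another repository
        with deterministic UUIDs (``storage``, ``run``, and
        ``refs_from_other_repo`` are illustrative names)::

            refs = list(storage.import_(
                run,
                refs_from_other_repo,
                idGenerationMode=DatasetIdGenEnum.DATAID_TYPE_RUN,
            ))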
        """
        raise NotImplementedError()

    @abstractmethod
    def find(self, collection: CollectionRecord, dataId: DataCoordinate,
             timespan: Optional[Timespan] = None) -> Optional[DatasetRef]:
        """Search a collection for a dataset with the given data ID.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection to search for the
            dataset. May have any `CollectionType`.
        dataId : `DataCoordinate`
            Complete (but not necessarily expanded) data ID to search with,
            with ``dataId.graph == self.datasetType.dimensions``.
        timespan : `Timespan`, optional
            A timespan that the validity range of the dataset must overlap.
            Required if ``collection.type is CollectionType.CALIBRATION``,
            and ignored otherwise.

        Returns
        -------
        ref : `DatasetRef`
            A resolved `DatasetRef` (without components populated), or `None`
            if no matching dataset was found.
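
        Examples
        --------
        A hypothetical lookup in a calibration collection, where a timespan
        is required (``storage``, ``calib_record``, ``begin``, and ``end``
        are illustrative names)::

            ref = storage.find(calib_record, dataId,
                               timespan=Timespan(begin, end))
            if ref is None:
                ...  # nothing certified for this data ID at that time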
        """
        raise NotImplementedError()

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be deleted. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
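
        Examples
        --------
        A hypothetical sketch tagging already-resolved datasets into a
        `~CollectionType.TAGGED` collection (``storage``, ``tagged_record``,
        and ``resolved_refs`` are illustrative names)::

            storage.associate(tagged_record, resolved_refs)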
        """
        raise NotImplementedError()

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(self, collection: CollectionRecord, datasets: Iterable[DatasetRef],
                timespan: Timespan) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset
            with the same `DatasetType` and data ID and an overlapping
            validity range.
        TypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
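
        Examples
        --------
        A hypothetical sketch certifying resolved calibration datasets over
        a validity range (``storage``, ``calib_record``, ``resolved_refs``,
        ``begin``, and ``end`` are illustrative names)::

            storage.certify(calib_record, resolved_refs,
                            Timespan(begin, end))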
        """
        raise NotImplementedError()

    @abstractmethod
    def decertify(self, collection: CollectionRecord, timespan: Timespan, *,
                  dataIds: Optional[Iterable[DataCoordinate]] = None) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.

        Raises
        ------
        TypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
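
        Examples
        --------
        A hypothetical sketch (``storage``, ``calib_record``, ``t1``, and
        ``t2`` are illustrative names): if a dataset is certified over
        ``[t0, t4)`` and ``[t1, t2)`` is decertified, its validity range is
        split into ``[t0, t1)`` and ``[t2, t4)``::

            storage.decertify(calib_record, Timespan(t1, t2),
                              dataIds=[dataId])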
        """
        raise NotImplementedError()

    @abstractmethod
    def select(self, *collections: CollectionRecord,
               dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
               id: SimpleQuery.Select.Or[Optional[DatasetId]] = SimpleQuery.Select,
               run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
               timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
               ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
               ) -> SimpleQuery:
        """Return a SQLAlchemy object that represents a ``SELECT`` query for
        this `DatasetType`.

        All arguments can either be a value that constrains the query or
        the `SimpleQuery.Select` tag object to indicate that the value should
        be returned in the columns in the ``SELECT`` clause. The default is
        `SimpleQuery.Select`.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type.
        dataId : `DataCoordinate` or `Select`
            The data ID to restrict results with, or an instruction to return
            the data ID via columns with names
            ``self.datasetType.dimensions.names``.
        id : `DatasetId`, `Select`, or `None`
            The primary key value for the dataset, an instruction to return
            it via an ``id`` column, or `None` to ignore it entirely.
        run : `None` or `Select`
            If `Select` (default), include the dataset's run key value (as a
            column labeled with the return value of
            ``CollectionManager.getRunForeignKeyName``).
            If `None`, do not include this column (to constrain the run,
            pass a `RunRecord` as the ``collection`` argument instead).
        timespan : `None`, `Select`, or `Timespan`
            If `Select` (default), include the validity range timespan in the
            result columns. If a `Timespan` instance, constrain the results
            to those whose validity ranges overlap that given timespan.
            Ignored unless ``collection.type is CollectionType.CALIBRATION``.
        ingestDate : `None`, `Select`, or `Timespan`
            If `Select`, include the ingest timestamp in the result columns.
            If a `Timespan` instance, constrain the results to those whose
            ingest times lie inside the given timespan, and also include the
            timestamp in the result columns. If `None` (default), there is
            no constraint and the timestamp is not returned.

        Returns
        -------
        query : `SimpleQuery`
            A struct containing the SQLAlchemy object that represents a
            simple ``SELECT`` query.
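
        Examples
        --------
        A hypothetical sketch constraining the query to one data ID in a
        single `~CollectionType.RUN` collection while returning the dataset
        ID column (``storage``, ``run_record``, and ``dataId`` are
        illustrative names)::

            query = storage.select(run_record, dataId=dataId,
                                   run=None, timespan=None)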
        """
        raise NotImplementedError()

    datasetType: DatasetType
    """Dataset type whose records this object manages (`DatasetType`).
    """


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext, *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, and is
        usually a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(cls, tableSpec: ddl.TableSpec, *,
                             name: str = "dataset", constraint: bool = True,
                             onDelete: Optional[str] = None,
                             **kwargs: Any) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the collection row is deleted. `None`
            indicates that this should be an integrity error.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined
            to the dataset primary key, but do not add a foreign key
            constraint.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
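
        Examples
        --------
        A hypothetical sketch adding a cascading dataset foreign key to an
        existing table specification (``manager`` and ``tableSpec`` are
        illustrative names)::

            idSpec = manager.addDatasetForeignKey(tableSpec,
                                                  onDelete="CASCADE")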
        """
        raise NotImplementedError()

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises `KeyError`
        when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        KeyError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
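
        Examples
        --------
        A hypothetical lookup by dataset type name (``manager`` and the
        name are illustrative)::

            storage = manager["calexp"]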
        """
        result = self.find(name)
        if result is None:
            raise KeyError(f"Dataset type with name '{name}' not found.")
        return result

    @abstractmethod
    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type,
            or `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
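
        Examples
        --------
        A hypothetical registration (``manager`` and ``datasetType`` are
        illustrative names)::

            storage, inserted = manager.register(datasetType)
            if inserted:
                ...  # the dataset type was new to this repository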
        """
        raise NotImplementedError()

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def __iter__(self) -> Iterator[DatasetType]:
        """Return an iterator over the dataset types present in this layer.

        Notes
        -----
        Dataset types registered by another client of the same layer since
        the last call to `initialize` or `refresh` may not be included.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]:
        """Return a `DatasetRef` for the given dataset primary key
        value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with
            the given primary key value exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()