Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 70%
95 statements

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdGenEnum")

import enum
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Optional, Tuple

import sqlalchemy.sql

from ...core import DataCoordinate, DatasetId, DatasetRef, DatasetType, SimpleQuery, Timespan, ddl
from ._versioning import VersionedExtension

if TYPE_CHECKING:
    from ..summaries import CollectionSummary
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetIdGenEnum(enum.Enum):
    """This enum is used to specify dataset ID generation options for the
    ``insert()`` method.
    """

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g. one
    auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination
    of dataset type and data ID.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination
    of dataset type, data ID, and run collection name.
    """


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added.  The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE`, each new dataset is inserted with a new unique ID.
            With a non-`UNIQUE` mode, the ID is computed deterministically
            from some combination of dataset type, data ID, and run
            collection name; if the same ID is already in the database, no
            new record is inserted.

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
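
        Examples
        --------
        A minimal sketch, assuming ``storage`` is a concrete instance of
        this interface and ``run_record`` is an existing `RunRecord` (both
        names are hypothetical)::

            refs = list(
                storage.insert(
                    run_record,
                    expanded_data_ids,
                    idGenerationMode=DatasetIdGenEnum.UNIQUE,
                )
            )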
104 """
105 raise NotImplementedError()

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted.  Datasets can specify an ``id``
            attribute, which will be used for the inserted datasets.  All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            the type of the dataset IDs does not match the type supported by
            this class, the IDs will be ignored and new IDs will be generated
            by the backend.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE`, each new dataset is inserted with a new unique ID.
            With a non-`UNIQUE` mode, the ID is computed deterministically
            from some combination of dataset type, data ID, and run
            collection name; if the same ID is already in the database, no
            new record is inserted.
        reuseIds : `bool`, optional
            If `True`, force re-use of imported dataset IDs for integer IDs,
            which are normally generated as auto-incremented; an exception
            will be raised if the imported IDs clash with existing ones.
            This option has no effect on the use of globally-unique IDs,
            which are always re-used (or generated if integer IDs are being
            imported).

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the datasets are
        expected to be identical across all datasets, but this is not checked
        and should be enforced by higher-level registry code.  This method
        does not need to use those attributes; only ``dataId`` and ``id`` are
        relevant.
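
        Examples
        --------
        A minimal sketch, assuming hypothetical ``storage``, ``run_record``,
        and ``refs`` objects::

            imported = list(storage.import_(run_record, refs, reuseIds=True))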
152 """
153 raise NotImplementedError()

    @abstractmethod
    def find(
        self, collection: CollectionRecord, dataId: DataCoordinate, timespan: Optional[Timespan] = None
    ) -> Optional[DatasetRef]:
        """Search a collection for a dataset with the given data ID.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection to search for the
            dataset.  May have any `CollectionType`.
        dataId : `DataCoordinate`
            Complete (but not necessarily expanded) data ID to search with,
            with ``dataId.graph == self.datasetType.dimensions``.
        timespan : `Timespan`, optional
            A timespan that the validity range of the dataset must overlap.
            Required if ``collection.type is CollectionType.CALIBRATION``,
            and ignored otherwise.

        Returns
        -------
        ref : `DatasetRef` or `None`
            A resolved `DatasetRef` (without components populated), or `None`
            if no matching dataset was found.
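
        Examples
        --------
        A minimal sketch, assuming hypothetical ``storage``,
        ``collection_record``, and ``data_id`` objects::

            ref = storage.find(collection_record, data_id)
            if ref is None:
                print("no matching dataset in this collection")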
179 """
180 raise NotImplementedError()

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be deleted.  All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated.  All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
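
        Examples
        --------
        A minimal sketch, assuming hypothetical ``storage``,
        ``tagged_record`` (a `~CollectionType.TAGGED` collection record), and
        resolved ``refs``::

            storage.associate(tagged_record, refs)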
225 """
226 raise NotImplementedError()

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be disassociated.  All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated.  All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
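
        Examples
        --------
        A minimal sketch, assuming hypothetical ``storage``,
        ``calib_record``, resolved ``refs``, and ``t1``/``t2`` endpoints for
        the validity range::

            storage.certify(calib_record, refs, Timespan(begin=t1, end=t2))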
278 """
279 raise NotImplementedError()

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Optional[Iterable[DataCoordinate]] = None,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range.  If `None`, all data IDs for ``self.datasetType`` will be
            decertified.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
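
        Examples
        --------
        A minimal sketch, assuming hypothetical ``storage``, ``calib_record``,
        ``data_id``, and ``bad_span`` (a `Timespan` to clear)::

            storage.decertify(calib_record, bad_span, dataIds=[data_id])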
311 """
312 raise NotImplementedError()

    @abstractmethod
    def select(
        self,
        *collections: CollectionRecord,
        dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
        id: SimpleQuery.Select.Or[Optional[DatasetId]] = SimpleQuery.Select,
        run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
        timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
        ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
    ) -> sqlalchemy.sql.Selectable:
        """Return a SQLAlchemy object that represents a ``SELECT`` query for
        this `DatasetType`.

        All arguments can either be a value that constrains the query or
        the `SimpleQuery.Select` tag object to indicate that the value should
        be returned in the columns in the ``SELECT`` clause.  The default is
        `SimpleQuery.Select`.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query.  May
            not be of type `CollectionType.CHAINED`.  If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type.
        dataId : `DataCoordinate` or `Select`
            The data ID to restrict results with, or an instruction to return
            the data ID via columns with names
            ``self.datasetType.dimensions.names``.
        id : `DatasetId`, `Select`, or `None`
            The primary key value for the dataset, an instruction to return it
            via an ``id`` column, or `None` to ignore it entirely.
        run : `None` or `Select`
            If `Select` (default), include the dataset's run key value (as a
            column labeled with the return value of
            ``CollectionManager.getRunForeignKeyName``).
            If `None`, do not include this column (to constrain the run,
            pass a `RunRecord` as the ``collection`` argument instead).
        timespan : `None`, `Select`, or `Timespan`
            If `Select` (default), include the validity range timespan in the
            result columns.  If a `Timespan` instance, constrain the results
            to those whose validity ranges overlap that given timespan.
            Ignored for collection types other than
            `~CollectionType.CALIBRATION`, but `None` should be passed
            explicitly if a mix of `~CollectionType.CALIBRATION` and other
            types are passed in.
        ingestDate : `None`, `Select`, or `Timespan`
            If `Select`, include the ingest timestamp in the result columns.
            If a `Timespan` instance, constrain the results to those whose
            ingest times fall inside the given timespan, and also include the
            timestamp in the result columns.  If `None` (default), there is
            no constraint and the timestamp is not returned.

        Returns
        -------
        query : `sqlalchemy.sql.Selectable`
            A SQLAlchemy object representing a simple ``SELECT`` query.
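
        Examples
        --------
        A minimal sketch, assuming hypothetical ``storage`` and
        ``run_record`` objects; per the notes above, a `RunRecord` passed as
        the collection also constrains the run::

            sql = storage.select(
                run_record,
                dataId=SimpleQuery.Select,
                id=SimpleQuery.Select,
                run=None,
                timespan=None,
            )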
370 """
371 raise NotImplementedError()
373 datasetType: DatasetType
374 """Dataset type whose records this object manages (`DatasetType`).
375 """


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, and is
        usually a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: Optional[str] = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table.  Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted.  `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
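
        Examples
        --------
        A minimal sketch, assuming a concrete manager class ``Manager`` and a
        hypothetical ``tableSpec``::

            fieldSpec = Manager.addDatasetForeignKey(tableSpec, onDelete="CASCADE")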
488 """
489 raise NotImplementedError()

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises `KeyError`
        when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        KeyError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
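
        Examples
        --------
        A minimal sketch, assuming ``manager`` is a concrete instance (the
        name and dataset type are hypothetical)::

            storage = manager["calexp"]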
520 """
521 result = self.find(name)
522 if result is None:
523 raise KeyError(f"Dataset type with name '{name}' not found.")
524 return result

    @abstractmethod
    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type,
            or `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
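
        Examples
        --------
        A minimal sketch, assuming hypothetical ``manager`` and
        ``dataset_type`` objects::

            storage, inserted = manager.register(dataset_type)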
571 """
572 raise NotImplementedError()

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def __iter__(self) -> Iterator[DatasetType]:
        """Return an iterator over the dataset types present in this layer.

        Notes
        -----
        Dataset types registered by another client of the same layer since
        the last call to `initialize` or `refresh` may not be included.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]:
        """Return a `DatasetRef` for the given dataset primary key
        value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()