# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdFactory", "DatasetIdGenEnum")

import enum
import uuid
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Iterable, Iterator, List, Optional, Tuple

import sqlalchemy.sql

from ...core import DataCoordinate, DatasetId, DatasetRef, DatasetType, SimpleQuery, Timespan, ddl
from ._versioning import VersionedExtension

if TYPE_CHECKING:
    from .._collection_summary import CollectionSummary
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager

class DatasetIdGenEnum(enum.Enum):
    """This enum is used to specify dataset ID generation options for the
    ``insert()`` method.
    """

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g.
    one auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination
    of the dataset type and data ID.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination
    of the dataset type, data ID, and run collection name.
    """

class DatasetIdFactory:
    """Factory for dataset IDs (UUIDs).

    For now the logic is hard-coded and is controlled by the user-provided
    value of `DatasetIdGenEnum`. In the future we may implement configurable
    logic that can guess the `DatasetIdGenEnum` value from other parameters.
    """

    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    """Namespace UUID used for UUID5 generation. Do not change. This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def makeDatasetId(
        self,
        run: str,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        idGenerationMode: DatasetIdGenEnum,
    ) -> uuid.UUID:
        """Generate a dataset ID for a dataset.

        Parameters
        ----------
        run : `str`
            Name of the RUN collection for the dataset.
        datasetType : `DatasetType`
            Dataset type.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
        """
        if idGenerationMode is DatasetIdGenEnum.UNIQUE:
            return uuid.uuid4()
        else:
            # WARNING: If you modify this code make sure that the order of
            # items in the `items` list below never changes.
            items: List[Tuple[str, str]] = []
            if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
                items = [
                    ("dataset_type", datasetType.name),
                ]
            elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
                items = [
                    ("dataset_type", datasetType.name),
                    ("run", run),
                ]
            else:
                raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

            for name, value in sorted(dataId.byName().items()):
                items.append((name, str(value)))
            data = ",".join(f"{key}={value}" for key, value in items)
            return uuid.uuid5(self.NS_UUID, data)

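    # A minimal usage sketch (``flatType`` and ``dataId`` are hypothetical,
    # not part of this module): in a deterministic mode, repeated calls with
    # the same inputs yield the same UUID5, while UNIQUE draws a fresh UUID4
    # each time.
    #
    #     factory = DatasetIdFactory()
    #     id1 = factory.makeDatasetId(
    #         "HSC/runs/test", flatType, dataId, DatasetIdGenEnum.DATAID_TYPE_RUN
    #     )
    #     id2 = factory.makeDatasetId(
    #         "HSC/runs/test", flatType, dataId, DatasetIdGenEnum.DATAID_TYPE_RUN
    #     )
    #     assert id1 == id2  # same type/run/dataId -> same ID
    #     id3 = factory.makeDatasetId(
    #         "HSC/runs/test", flatType, dataId, DatasetIdGenEnum.UNIQUE
    #     )
    #     assert id3 != id1  # UNIQUE is random, not deterministic
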
class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added. The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE`, each new dataset is inserted with a new unique ID.
            With a non-`UNIQUE` mode, the ID is computed deterministically
            from some combination of the dataset type, data ID, and run
            collection name; if that ID is already in the database, no new
            record is inserted.

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
        """
        raise NotImplementedError()

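    # A hedged usage sketch (``storage``, ``runRecord``, and ``dataIds`` are
    # hypothetical, with ``storage`` an instance of a concrete subclass):
    #
    #     refs = list(
    #         storage.insert(runRecord, dataIds, DatasetIdGenEnum.DATAID_TYPE_RUN)
    #     )
    #     # Each returned DatasetRef is resolved, i.e. carries its new ID.
    #     assert all(ref.id is not None for ref in refs)
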
    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted. Each dataset can specify an ``id``
            attribute, which will be used for the inserted dataset. All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            the type of the dataset IDs does not match the type supported by
            this class, the IDs will be ignored and new IDs will be generated
            by the backend.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE`, each new dataset is inserted with a new unique ID.
            With a non-`UNIQUE` mode, the ID is computed deterministically
            from some combination of the dataset type, data ID, and run
            collection name; if that ID is already in the database, no new
            record is inserted.
        reuseIds : `bool`, optional
            If `True`, force re-use of imported dataset IDs for integer IDs,
            which are normally generated as auto-incremented values; an
            exception will be raised if the imported IDs clash with existing
            ones. This option has no effect on globally-unique IDs, which are
            always re-used (or generated if integer IDs are being imported).

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the given datasets are
        supposed to be identical across all of them, but this is not checked
        here and should be enforced by higher-level registry code. This
        method does not need to use those attributes; only ``dataId`` and
        ``id`` are relevant.
        """
        raise NotImplementedError()

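    # A hedged sketch (hypothetical ``storage``, ``runRecord``, and ``refs``
    # carrying pre-existing UUIDs, e.g. from an export file): with a
    # UUID-based backend the supplied IDs are re-used verbatim.
    #
    #     imported = list(storage.import_(runRecord, refs))
    #     assert {r.id for r in imported} == {r.id for r in refs}
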
    @abstractmethod
    def find(
        self, collection: CollectionRecord, dataId: DataCoordinate, timespan: Optional[Timespan] = None
    ) -> Optional[DatasetRef]:
        """Search a collection for a dataset with the given data ID.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection to search for the
            dataset. May have any `CollectionType`.
        dataId : `DataCoordinate`
            Complete (but not necessarily expanded) data ID to search with,
            with ``dataId.graph == self.datasetType.dimensions``.
        timespan : `Timespan`, optional
            A timespan that the validity range of the dataset must overlap.
            Required if ``collection.type is CollectionType.CALIBRATION``, and
            ignored otherwise.

        Returns
        -------
        ref : `DatasetRef` or `None`
            A resolved `DatasetRef` (without components populated), or `None`
            if no matching dataset was found.
        """
        raise NotImplementedError()

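    # A hedged sketch (hypothetical ``storage``, ``calibRecord``, ``dataId``,
    # and ``obsTimespan``): for CALIBRATION collections the timespan selects
    # the certified dataset whose validity range overlaps it.
    #
    #     ref = storage.find(calibRecord, dataId, timespan=obsTimespan)
    #     if ref is None:
    #         ...  # nothing certified for that data ID and time range
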
    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be deleted. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
        """
        raise NotImplementedError()

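    # A hedged sketch (hypothetical ``storage``, ``taggedRecord``, and
    # ``refs``): tagging is idempotent per dataset, and a newly associated
    # dataset displaces any previous one with the same data ID.
    #
    #     storage.associate(taggedRecord, refs)
    #     storage.associate(taggedRecord, refs)  # no-op the second time
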
    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
        """
        raise NotImplementedError()

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Optional[Iterable[DataCoordinate]] = None,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
        """
        raise NotImplementedError()

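    # A hedged sketch (hypothetical ``storage``, ``calibRecord``, ``refs``,
    # and Timespan endpoints ``t1`` < ``t2`` < ``t3`` < ``t4``): decertifying
    # the middle of a certified range splits it into two ranges.
    #
    #     storage.certify(calibRecord, refs, Timespan(t1, t4))
    #     storage.decertify(calibRecord, Timespan(t2, t3))
    #     # refs remain certified over [t1, t2) and [t3, t4) only.
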
    @abstractmethod
    def select(
        self,
        *collections: CollectionRecord,
        dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
        id: SimpleQuery.Select.Or[Optional[DatasetId]] = SimpleQuery.Select,
        run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
        timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
        ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
        rank: SimpleQuery.Select.Or[None] = None,
    ) -> sqlalchemy.sql.Selectable:
        """Return a SQLAlchemy object that represents a ``SELECT`` query for
        this `DatasetType`.

        All arguments can either be a value that constrains the query or
        the `SimpleQuery.Select` tag object to indicate that the value should
        be returned in the columns in the ``SELECT`` clause. The default is
        `SimpleQuery.Select`.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type.
        dataId : `DataCoordinate` or `Select`
            The data ID to restrict results with, or an instruction to return
            the data ID via columns with names
            ``self.datasetType.dimensions.names``.
        id : `DatasetId`, `Select`, or `None`
            The primary key value for the dataset, an instruction to return it
            via an ``id`` column, or `None` to ignore it entirely.
        run : `None` or `Select`
            If `Select` (default), include the dataset's run key value (as a
            column labeled with the return value of
            ``CollectionManager.getRunForeignKeyName``).
            If `None`, do not include this column (to constrain the run,
            pass a `RunRecord` as the ``collection`` argument instead).
        timespan : `None`, `Select`, or `Timespan`
            If `Select` (default), include the validity range timespan in the
            result columns. If a `Timespan` instance, constrain the results to
            those whose validity ranges overlap that given timespan. For
            collections whose type is not `~CollectionType.CALIBRATION`, if
            `Select` is passed a column with a literal ``NULL`` value will be
            added, and ``sqlalchemy.sql.expressions.Null`` may be passed to
            force a constraint that the value be null (since `None` is
            interpreted as meaning "do not select or constrain this column").
        ingestDate : `None`, `Select`, or `Timespan`
            If `Select`, include the ingest timestamp in the result columns.
            If a `Timespan` instance, constrain the results to those whose
            ingest times are inside the given timespan, and also include the
            timestamp in the result columns. If `None` (default), there is no
            constraint and the timestamp is not returned.
        rank : `Select` or `None`
            If `Select`, include a calculated column that is the integer rank
            of the row's collection in the given list of collections, starting
            from zero.

        Returns
        -------
        query : `sqlalchemy.sql.Selectable`
            A SQLAlchemy object representing a simple ``SELECT`` query.
        """
        raise NotImplementedError()

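    # A hedged sketch (hypothetical ``storage`` and RUN collection records):
    # select the data ID and dataset ID columns over two RUN collections,
    # omitting the run and timespan columns and adding a rank column for
    # collection-order tie-breaking.
    #
    #     sql = storage.select(
    #         run1Record,
    #         run2Record,
    #         run=None,
    #         timespan=None,
    #         rank=SimpleQuery.Select,
    #     )
    #     # ``sql`` can then be executed by the registry's database layer.
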
    datasetType: DatasetType
    """Dataset type whose records this object manages (`DatasetType`).
    """

class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, usually
        a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: Optional[str] = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
        """
        raise NotImplementedError()

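    # A hedged sketch (assuming ``ddl.TableSpec`` accepts an initial field
    # list; ``manager`` and the resulting column name are hypothetical):
    # add a cascading dataset foreign key to a dependent table's spec.
    #
    #     spec = ddl.TableSpec(fields=[])
    #     idSpec = manager.addDatasetForeignKey(spec, onDelete="CASCADE")
    #     # spec now holds a ``dataset_id`` field (plus the FK constraint),
    #     # and idSpec describes that field.
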
    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises `KeyError`
        when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        KeyError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        result = self.find(name)
        if result is None:
            raise KeyError(f"Dataset type with name '{name}' not found.")
        return result

    @abstractmethod
    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type,
            or `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
        """
        raise NotImplementedError()

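    # A hedged sketch (hypothetical ``manager`` and ``rawType``): register
    # is idempotent, so a repeated call finds the existing storage.
    #
    #     storage, inserted = manager.register(rawType)
    #     _, insertedAgain = manager.register(rawType)
    #     assert not insertedAgain  # already registered by the first call
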
    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def __iter__(self) -> Iterator[DatasetType]:
        """Return an iterator over the dataset types present in this layer.

        Notes
        -----
        Dataset types registered by another client of the same layer since
        the last call to `initialize` or `refresh` may not be included.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]:
        """Return a `DatasetRef` for the given dataset primary key value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()