Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 60%
113 statements

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdFactory", "DatasetIdGenEnum")

import enum
import uuid
from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator
from typing import TYPE_CHECKING, Any

import sqlalchemy.sql

from ...core import DataCoordinate, DatasetId, DatasetRef, DatasetType, SimpleQuery, Timespan, ddl
from ._versioning import VersionedExtension

if TYPE_CHECKING:
    from .._collection_summary import CollectionSummary
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetIdGenEnum(enum.Enum):
    """This enum is used to specify dataset ID generation options for the
    ``insert()`` method.
    """

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g.
    one auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination
    of dataset type and data ID.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination
    of dataset type, data ID, and run collection name.
    """


class DatasetIdFactory:
    """Factory for dataset IDs (UUIDs).

    For now the logic is hard-coded and controlled by the user-provided
    value of `DatasetIdGenEnum`. In the future we may implement configurable
    logic that can guess the `DatasetIdGenEnum` value from other parameters.
    """

    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    """Namespace UUID used for UUID5 generation. Do not change. This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def makeDatasetId(
        self,
        run: str,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        idGenerationMode: DatasetIdGenEnum,
    ) -> uuid.UUID:
        """Generate a dataset ID for a dataset.

        Parameters
        ----------
        run : `str`
            Name of the RUN collection for the dataset.
        datasetType : `DatasetType`
            Dataset type.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
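
        Examples
        --------
        A minimal sketch (the dataset type, data ID, and run name here are
        placeholders); in a deterministic mode the same inputs always
        produce the same UUID::

            factory = DatasetIdFactory()
            mode = DatasetIdGenEnum.DATAID_TYPE_RUN
            id1 = factory.makeDatasetId("HSC/runs/demo", datasetType, dataId, mode)
            id2 = factory.makeDatasetId("HSC/runs/demo", datasetType, dataId, mode)
            assert id1 == id2  # deterministic modes are reproducible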
107 """
108 if idGenerationMode is DatasetIdGenEnum.UNIQUE:
109 return uuid.uuid4()
110 else:
111 # WARNING: If you modify this code make sure that the order of
112 # items in the `items` list below never changes.
113 items: list[tuple[str, str]] = []
114 if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
115 items = [
116 ("dataset_type", datasetType.name),
117 ]
118 elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
119 items = [
120 ("dataset_type", datasetType.name),
121 ("run", run),
122 ]
123 else:
124 raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")
126 for name, value in sorted(dataId.byName().items()):
127 items.append((name, str(value)))
128 data = ",".join(f"{key}={value}" for key, value in items)
129 return uuid.uuid5(self.NS_UUID, data)


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added. The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with its own unique ID.
            With a non-`UNIQUE` mode the ID is computed deterministically from
            some combination of dataset type, data ID, and run collection
            name; if a dataset with the same ID is already in the database,
            a new record is not inserted.

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
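
        Examples
        --------
        An illustrative sketch (assumes a concrete implementation, a resolved
        ``run_record``, and expanded ``data_ids``; the names are
        placeholders)::

            refs = list(storage.insert(run_record, data_ids))
            assert all(ref.id is not None for ref in refs)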
173 """
174 raise NotImplementedError()

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted. Each dataset can specify an ``id``
            attribute, which will be used for the inserted dataset. All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            the dataset ID type does not match the type supported by this
            class, the IDs will be ignored and new IDs will be generated by
            the backend.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with its own unique ID.
            With a non-`UNIQUE` mode the ID is computed deterministically from
            some combination of dataset type, data ID, and run collection
            name; if a dataset with the same ID is already in the database,
            a new record is not inserted.
        reuseIds : `bool`, optional
            If `True`, force re-use of imported dataset IDs for integer IDs,
            which are normally generated as auto-incremented; an exception
            will be raised if imported IDs clash with existing ones. This
            option has no effect on the use of globally-unique IDs, which
            are always re-used (or generated if integer IDs are being
            imported).

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the datasets are
        expected to be identical across all datasets, but this is not checked
        and should be enforced by higher-level registry code. This method
        does not need to use those attributes; only ``dataId`` and ``id``
        are relevant.
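
        Examples
        --------
        A hedged sketch of re-importing datasets exported from another
        repository (``exported_refs`` and ``run_record`` are placeholders)::

            imported = list(
                storage.import_(
                    run_record,
                    exported_refs,
                    idGenerationMode=DatasetIdGenEnum.DATAID_TYPE_RUN,
                )
            )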
221 """
222 raise NotImplementedError()

    @abstractmethod
    def find(
        self, collection: CollectionRecord, dataId: DataCoordinate, timespan: Timespan | None = None
    ) -> DatasetRef | None:
        """Search a collection for a dataset with the given data ID.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection to search for the
            dataset. May have any `CollectionType`.
        dataId : `DataCoordinate`
            Complete (but not necessarily expanded) data ID to search with,
            with ``dataId.graph == self.datasetType.dimensions``.
        timespan : `Timespan`, optional
            A timespan that the validity range of the dataset must overlap.
            Required if ``collection.type is CollectionType.CALIBRATION``, and
            ignored otherwise.

        Returns
        -------
        ref : `DatasetRef`
            A resolved `DatasetRef` (without components populated), or `None`
            if no matching dataset was found.
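
        Examples
        --------
        An illustrative sketch (``collection_record`` and ``data_id`` are
        placeholders for resolved objects)::

            ref = storage.find(collection_record, data_id)
            if ref is None:
                print("no matching dataset in this collection")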
248 """
249 raise NotImplementedError()

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be deleted. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
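
        Examples
        --------
        A sketch of tagging previously-inserted datasets (``tagged_record``
        is a placeholder for a `~CollectionType.TAGGED` collection record)::

            refs = list(storage.insert(run_record, data_ids))
            storage.associate(tagged_record, refs)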
294 """
295 raise NotImplementedError()

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
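
        Examples
        --------
        A sketch of certifying resolved calibration datasets for a validity
        range (``calib_record``, ``refs``, and the endpoint times are
        placeholders)::

            storage.certify(
                calib_record,
                refs,
                Timespan(begin=start_time, end=end_time),
            )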
347 """
348 raise NotImplementedError()

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Iterable[DataCoordinate] | None = None,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
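
        Examples
        --------
        A sketch of clearing part of the validity range for every data ID of
        this dataset type (all names are placeholders)::

            storage.decertify(calib_record, Timespan(begin=t0, end=t1))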
380 """
381 raise NotImplementedError()

    @abstractmethod
    def select(
        self,
        *collections: CollectionRecord,
        dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
        id: SimpleQuery.Select.Or[DatasetId | None] = SimpleQuery.Select,
        run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
        timespan: SimpleQuery.Select.Or[Timespan | None] = SimpleQuery.Select,
        ingestDate: SimpleQuery.Select.Or[Timespan | None] = None,
        rank: SimpleQuery.Select.Or[None] = None,
    ) -> sqlalchemy.sql.Selectable:
        """Return a SQLAlchemy object that represents a ``SELECT`` query for
        this `DatasetType`.

        All arguments can either be a value that constrains the query or
        the `SimpleQuery.Select` tag object to indicate that the value should
        be returned in the columns in the ``SELECT`` clause. The default is
        `SimpleQuery.Select`.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type.
        dataId : `DataCoordinate` or `Select`
            The data ID to restrict results with, or an instruction to return
            the data ID via columns with names
            ``self.datasetType.dimensions.names``.
        id : `DatasetId`, `Select`, or `None`
            The primary key value for the dataset, an instruction to return it
            via an ``id`` column, or `None` to ignore it entirely.
        run : `None` or `Select`
            If `Select` (default), include the dataset's run key value (as a
            column labeled with the return value of
            ``CollectionManager.getRunForeignKeyName``).
            If `None`, do not include this column (to constrain the run,
            pass a `RunRecord` as the ``collection`` argument instead).
        timespan : `None`, `Select`, or `Timespan`
            If `Select` (default), include the validity range timespan in the
            result columns. If a `Timespan` instance, constrain the results to
            those whose validity ranges overlap that given timespan. For
            collections whose type is not `~CollectionType.CALIBRATION`, if
            `Select` is passed a column with a literal ``NULL`` value will be
            added, and ``sqlalchemy.sql.expression.Null`` may be passed to
            force a constraint that the value be null (since `None` is
            interpreted as meaning "do not select or constrain this column").
        ingestDate : `None`, `Select`, or `Timespan`
            If `Select`, include the ingest timestamp in the result columns.
            If a `Timespan` instance, constrain the results to those whose
            ingest times are inside the given timespan, and also include the
            timestamp in the result columns. If `None` (default), there is
            no constraint and the timestamp is not returned.
        rank : `Select` or `None`
            If `Select`, include a calculated column that is the integer rank
            of the row's collection in the given list of collections, starting
            from zero.

        Returns
        -------
        query : `sqlalchemy.sql.Selectable`
            A SQLAlchemy object representing a simple ``SELECT`` query.
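
        Examples
        --------
        A sketch of building a query over two run collections, constrained
        to a fixed data ID, with the ``id`` and run-key columns returned by
        default (the record objects and ``data_id`` are placeholders)::

            sql = storage.select(
                run_record_1,
                run_record_2,
                dataId=data_id,
                timespan=None,
                ingestDate=None,
            )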
446 """
447 raise NotImplementedError()
449 datasetType: DatasetType
450 """Dataset type whose records this object manages (`DatasetType`).
451 """


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, and is
        usually a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
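
        Examples
        --------
        A hedged sketch of declaring a table that references the dataset
        table (the table specification and manager class are placeholders)::

            idSpec = manager.addDatasetForeignKey(tableSpec, onDelete="CASCADE")
            # tableSpec now contains a ``dataset_id`` field and, because
            # ``constraint`` defaulted to True, a foreign key constraint.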
564 """
565 raise NotImplementedError()

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises `KeyError`
        when the dataset type is not found.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        KeyError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
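
        Examples
        --------
        A sketch (the dataset type name is a placeholder)::

            storage = manager["calexp"]  # raises KeyError if not registered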
596 """
597 result = self.find(name)
598 if result is None:
599 raise KeyError(f"Dataset type with name '{name}' not found.")
600 return result

    @abstractmethod
    def find(self, name: str) -> DatasetRecordStorage | None:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type, or
            `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
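
        Examples
        --------
        A sketch (``datasetType`` is a placeholder for a `DatasetType`
        constructed elsewhere)::

            storage, inserted = manager.register(datasetType)
            if inserted:
                print(f"registered new dataset type {datasetType.name}")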
647 """
648 raise NotImplementedError()

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def __iter__(self) -> Iterator[DatasetType]:
        """Return an iterator over the dataset types present in this layer.

        Notes
        -----
        Dataset types registered by another client of the same layer since
        the last call to `initialize` or `refresh` may not be included.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        """Return a `DatasetRef` for the given dataset primary key
        value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()