Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 60%
112 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdFactory", "DatasetIdGenEnum")

import enum
import uuid
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Iterable, Iterator, List, Optional, Tuple

import sqlalchemy.sql

from ...core import DataCoordinate, DatasetId, DatasetRef, DatasetType, SimpleQuery, Timespan, ddl
from ._versioning import VersionedExtension

if TYPE_CHECKING:
    from ..summaries import CollectionSummary
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetIdGenEnum(enum.Enum):
    """This enum is used to specify dataset ID generation options for the
    ``insert()`` method.
    """

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g.
    one auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination
    of dataset type and data ID.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination
    of dataset type, data ID, and run collection name.
    """


class DatasetIdFactory:
    """Factory for dataset IDs (UUIDs).

    For now the logic is hard-coded and is controlled by the user-provided
    value of `DatasetIdGenEnum`. In the future we may implement configurable
    logic that can guess the `DatasetIdGenEnum` value from other parameters.
    """

    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    """Namespace UUID used for UUID5 generation. Do not change. This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def makeDatasetId(
        self,
        run: str,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        idGenerationMode: DatasetIdGenEnum,
    ) -> uuid.UUID:
        """Generate a dataset ID for a dataset.

        Parameters
        ----------
        run : `str`
            Name of the RUN collection for the dataset.
        datasetType : `DatasetType`
            Dataset type.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
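
        Examples
        --------
        A minimal sketch of the deterministic scheme: the
        non-`~DatasetIdGenEnum.UNIQUE` modes reduce to `uuid.uuid5` over a
        canonical ``key=value`` string, so identical inputs always yield the
        same ID (the string below is illustrative, not a real dataset):

        >>> import uuid
        >>> data = "dataset_type=flat,run=my_run,detector=42"
        >>> uuid.uuid5(DatasetIdFactory.NS_UUID, data) == uuid.uuid5(
        ...     DatasetIdFactory.NS_UUID, data
        ... )
        True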
106 """
107 if idGenerationMode is DatasetIdGenEnum.UNIQUE:
108 return uuid.uuid4()
109 else:
110 # WARNING: If you modify this code make sure that the order of
111 # items in the `items` list below never changes.
112 items: List[Tuple[str, str]] = []
113 if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
114 items = [
115 ("dataset_type", datasetType.name),
116 ]
117 elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
118 items = [
119 ("dataset_type", datasetType.name),
120 ("run", run),
121 ]
122 else:
123 raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")
125 for name, value in sorted(dataId.byName().items()):
126 items.append((name, str(value)))
127 data = ",".join(f"{key}={value}" for key, value in items)
128 return uuid.uuid5(self.NS_UUID, data)


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added. The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with a new unique ID.
            With a non-`UNIQUE` mode the ID is computed from some combination
            of dataset type, data ID, and run collection name; if the same ID
            is already in the database then a new record is not inserted.

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
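
        Examples
        --------
        A minimal sketch, assuming ``storage`` is a concrete implementation
        obtained from a `DatasetRecordStorageManager` and that ``runRecord``
        and ``dataIds`` were built elsewhere (none of these names are defined
        in this module)::

            refs = list(
                storage.insert(
                    runRecord,
                    dataIds,
                    idGenerationMode=DatasetIdGenEnum.DATAID_TYPE_RUN,
                )
            )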
172 """
173 raise NotImplementedError()

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted. Each dataset can specify an ``id``
            attribute, which will be used for the inserted dataset. All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            the type of the dataset IDs does not match the type supported by
            this class, the IDs will be ignored and new IDs will be generated
            by the backend.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with a new unique ID.
            With a non-`UNIQUE` mode the ID is computed from some combination
            of dataset type, data ID, and run collection name; if the same ID
            is already in the database then a new record is not inserted.
        reuseIds : `bool`, optional
            If `True`, force re-use of imported dataset IDs for integer IDs,
            which are normally generated as auto-incremented. An exception
            will be raised if imported IDs clash with existing ones. This
            option has no effect on globally-unique IDs, which are always
            re-used (or generated if integer IDs are being imported).

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the given datasets are
        supposed to be identical across all of them, but this is not checked
        and should instead be enforced by higher-level registry code. This
        method does not need to use those attributes; only ``dataId`` and
        ``id`` are relevant.
220 """
221 raise NotImplementedError()

    @abstractmethod
    def find(
        self, collection: CollectionRecord, dataId: DataCoordinate, timespan: Optional[Timespan] = None
    ) -> Optional[DatasetRef]:
        """Search a collection for a dataset with the given data ID.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection to search for the
            dataset. May have any `CollectionType`.
        dataId : `DataCoordinate`
            Complete (but not necessarily expanded) data ID to search with,
            with ``dataId.graph == self.datasetType.dimensions``.
        timespan : `Timespan`, optional
            A timespan that the validity range of the dataset must overlap.
            Required if ``collection.type is CollectionType.CALIBRATION``, and
            ignored otherwise.

        Returns
        -------
        ref : `DatasetRef` or `None`
            A resolved `DatasetRef` (without components populated), or `None`
            if no matching dataset was found.
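
        Examples
        --------
        A minimal sketch, assuming ``storage``, a `CollectionRecord`
        ``collection``, and a complete ``dataId`` are defined elsewhere::

            ref = storage.find(collection, dataId)
            if ref is None:
                ...  # no matching dataset in this collection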
247 """
248 raise NotImplementedError()

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be deleted. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
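
        Examples
        --------
        A minimal sketch, assuming ``storage``, a TAGGED-collection record
        ``tagged``, and resolved ``refs`` are defined elsewhere::

            storage.associate(tagged, refs)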
293 """
294 raise NotImplementedError()

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
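
        Examples
        --------
        A minimal sketch, assuming ``storage`` manages a calibration dataset
        type and that ``calib``, ``refs``, and the `astropy.time.Time` values
        ``begin`` and ``end`` are defined elsewhere::

            storage.certify(calib, refs, Timespan(begin=begin, end=end))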
346 """
347 raise NotImplementedError()

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Optional[Iterable[DataCoordinate]] = None,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
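
        Examples
        --------
        A minimal sketch using the same assumed names as in `certify`; clear
        part of the validity range for two specific data IDs::

            storage.decertify(
                calib, Timespan(begin=begin, end=end), dataIds=[dataId1, dataId2]
            )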
379 """
380 raise NotImplementedError()

    @abstractmethod
    def select(
        self,
        *collections: CollectionRecord,
        dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
        id: SimpleQuery.Select.Or[Optional[DatasetId]] = SimpleQuery.Select,
        run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
        timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
        ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
    ) -> sqlalchemy.sql.Selectable:
        """Return a SQLAlchemy object that represents a ``SELECT`` query for
        this `DatasetType`.

        All arguments can either be a value that constrains the query or
        the `SimpleQuery.Select` tag object to indicate that the value should
        be returned in the columns in the ``SELECT`` clause. The default is
        `SimpleQuery.Select` for all arguments except ``ingestDate``.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type.
        dataId : `DataCoordinate` or `Select`
            The data ID to restrict results with, or an instruction to return
            the data ID via columns with names
            ``self.datasetType.dimensions.names``.
        id : `DatasetId`, `Select`, or `None`
            The primary key value for the dataset, an instruction to return it
            via an ``id`` column, or `None` to ignore it entirely.
        run : `None` or `Select`
            If `Select` (default), include the dataset's run key value (as a
            column labeled with the return value of
            ``CollectionManager.getRunForeignKeyName``).
            If `None`, do not include this column (to constrain the run,
            pass a `RunRecord` as the ``collection`` argument instead).
        timespan : `None`, `Select`, or `Timespan`
            If `Select` (default), include the validity range timespan in the
            result columns. If a `Timespan` instance, constrain the results to
            those whose validity ranges overlap that given timespan. Ignored
            for collection types other than `~CollectionType.CALIBRATION`,
            but `None` should be passed explicitly if a mix of
            `~CollectionType.CALIBRATION` and other types are passed in.
        ingestDate : `None`, `Select`, or `Timespan`
            If `Select`, include the ingest timestamp in the result columns.
            If a `Timespan` instance, constrain the results to those whose
            ingest times are inside the given timespan, and also include the
            timestamp in the result columns. If `None` (default) there is no
            constraint and the timestamp is not returned.

        Returns
        -------
        query : `sqlalchemy.sql.Selectable`
            A SQLAlchemy object representing a simple ``SELECT`` query.
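
        Examples
        --------
        A minimal sketch, assuming ``storage``, a RUN-collection record
        ``runRecord``, and ``dataId`` are defined elsewhere; constrain to one
        data ID and return only the dataset ``id`` column::

            query = storage.select(runRecord, dataId=dataId, run=None, timespan=None)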
438 """
439 raise NotImplementedError()
441 datasetType: DatasetType
442 """Dataset type whose records this object manages (`DatasetType`).
443 """


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, usually
        a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: Optional[str] = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the collection row is deleted. `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
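
        Examples
        --------
        A minimal sketch, assuming ``manager_cls`` is a concrete subclass and
        ``tableSpec`` is a `ddl.TableSpec` under construction elsewhere::

            idSpec = manager_cls.addDatasetForeignKey(
                tableSpec, name="dataset", onDelete="CASCADE"
            )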
556 """
557 raise NotImplementedError()

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises `KeyError`
        when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        KeyError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
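
        Examples
        --------
        A minimal sketch, assuming ``manager`` is a concrete instance and
        ``"flat"`` is a registered dataset type name::

            storage = manager["flat"]        # raises KeyError if not found
            maybe = manager.find("unknown")  # returns None instead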
588 """
589 result = self.find(name)
590 if result is None:
591 raise KeyError(f"Dataset type with name '{name}' not found.")
592 return result

    @abstractmethod
    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type, or
            `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
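
        Examples
        --------
        A minimal sketch, assuming ``manager`` and ``datasetType`` are
        defined elsewhere::

            storage, inserted = manager.register(datasetType)
            if inserted:
                ...  # first registration of this dataset type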
639 """
640 raise NotImplementedError()

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def __iter__(self) -> Iterator[DatasetType]:
        """Return an iterator over the dataset types present in this layer.

        Notes
        -----
        Dataset types registered by another client of the same layer since
        the last call to `initialize` or `refresh` may not be included.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]:
        """Return a `DatasetRef` for the given dataset primary key value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()