Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 69%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdGenEnum")

import enum
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Optional, Tuple

from ...core import DataCoordinate, DatasetId, DatasetRef, DatasetType, SimpleQuery, Timespan, ddl
from ._versioning import VersionedExtension

if TYPE_CHECKING:
    from ..summaries import CollectionSummary
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetIdGenEnum(enum.Enum):
    """This enum is used to specify dataset ID generation options for the
    ``insert()`` method.
    """

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g.
    one auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination
    of dataset type and dataId.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination
    of dataset type, dataId, and run collection name.
    """


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            each dataset will be associated with.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added. The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE`, each new dataset is inserted with a new unique ID.
            With a non-`UNIQUE` mode, the ID is computed deterministically
            from some combination of dataset type, dataId, and run collection
            name; if the same ID is already in the database, no new record is
            inserted.

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
        """
        raise NotImplementedError()
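
    # Illustrative usage sketch (hypothetical names: ``storage`` is a concrete
    # implementation, ``run`` a `RunRecord`, and ``dataIds`` expanded data IDs
    # matching ``storage.datasetType.dimensions``)::
    #
    #     refs = list(storage.insert(run, dataIds))
    #     assert all(ref.id is not None for ref in refs)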

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            each dataset will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted. Datasets can specify an ``id``
            attribute, which will be used for the inserted datasets. All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            the type of the dataset IDs does not match the type supported by
            this class, the IDs will be ignored and new IDs will be generated
            by the backend.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE`, each new dataset is inserted with a new unique ID.
            With a non-`UNIQUE` mode, the ID is computed deterministically
            from some combination of dataset type, dataId, and run collection
            name; if the same ID is already in the database, no new record is
            inserted.
        reuseIds : `bool`, optional
            If `True`, force re-use of imported dataset IDs for integer IDs,
            which are normally generated as auto-incremented; an exception
            will be raised if the imported IDs clash with existing ones. This
            option has no effect on the use of globally-unique IDs, which are
            always re-used (or generated if integer IDs are being imported).

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the given datasets are
        supposed to be identical across all of them, but this is not checked
        and should be enforced by higher-level registry code. This method
        does not need to use those attributes; only ``dataId`` and ``id`` are
        relevant.
        """
        raise NotImplementedError()
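
    # Illustrative usage sketch (hypothetical names): importing
    # externally-created refs, keeping their integer IDs where the backend
    # allows it::
    #
    #     imported = list(storage.import_(run, refs, reuseIds=True))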

    @abstractmethod
    def find(
        self, collection: CollectionRecord, dataId: DataCoordinate, timespan: Optional[Timespan] = None
    ) -> Optional[DatasetRef]:
        """Search a collection for a dataset with the given data ID.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection to search for the
            dataset. May have any `CollectionType`.
        dataId : `DataCoordinate`
            Complete (but not necessarily expanded) data ID to search with,
            with ``dataId.graph == self.datasetType.dimensions``.
        timespan : `Timespan`, optional
            A timespan that the validity range of the dataset must overlap.
            Required if ``collection.type is CollectionType.CALIBRATION``, and
            ignored otherwise.

        Returns
        -------
        ref : `DatasetRef` or `None`
            A resolved `DatasetRef` (without components populated), or `None`
            if no matching dataset was found.
        """
        raise NotImplementedError()
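
    # Illustrative usage sketch (hypothetical names): searching a CALIBRATION
    # collection requires a timespan for the validity-range overlap test::
    #
    #     ref = storage.find(calibCollection, dataId, timespan=obsTimespan)
    #     if ref is None:
    #         ...  # nothing certified for this data ID at this time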

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be deleted. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
        """
        raise NotImplementedError()
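
    # Illustrative usage sketch (hypothetical names): tagging resolved refs
    # into a TAGGED collection; repeating the call is a no-op, and
    # `disassociate` below undoes it::
    #
    #     storage.associate(taggedCollection, refs)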

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
        """
        raise NotImplementedError()
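
    # Illustrative usage sketch (hypothetical names; assumes `Timespan` can be
    # constructed from begin/end times, as elsewhere in daf_butler)::
    #
    #     storage.certify(calibCollection, refs, Timespan(begin, end))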

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Optional[Iterable[DataCoordinate]] = None,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
        """
        raise NotImplementedError()
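
    # Illustrative usage sketch (hypothetical names): clearing a validity
    # window for two data IDs; overlapping ranges are trimmed or split::
    #
    #     storage.decertify(calibCollection, window, dataIds=[dataId1, dataId2])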

    @abstractmethod
    def select(
        self,
        *collections: CollectionRecord,
        dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
        id: SimpleQuery.Select.Or[Optional[DatasetId]] = SimpleQuery.Select,
        run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
        timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
        ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
    ) -> SimpleQuery:
        """Return a SQLAlchemy object that represents a ``SELECT`` query for
        this `DatasetType`.

        All arguments can either be a value that constrains the query or
        the `SimpleQuery.Select` tag object to indicate that the value should
        be returned in the columns in the ``SELECT`` clause. The default is
        `SimpleQuery.Select`.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type.
        dataId : `DataCoordinate` or `Select`
            The data ID to restrict results with, or an instruction to return
            the data ID via columns with names
            ``self.datasetType.dimensions.names``.
        id : `DatasetId`, `Select`, or `None`
            The primary key value for the dataset, an instruction to return it
            via an ``id`` column, or `None` to ignore it entirely.
        run : `None` or `Select`
            If `Select` (default), include the dataset's run key value (as a
            column labeled with the return value of
            ``CollectionManager.getRunForeignKeyName``).
            If `None`, do not include this column (to constrain the run,
            pass a `RunRecord` as the ``collection`` argument instead).
        timespan : `None`, `Select`, or `Timespan`
            If `Select` (default), include the validity range timespan in the
            result columns. If a `Timespan` instance, constrain the results to
            those whose validity ranges overlap that given timespan. Ignored
            unless ``collection.type is CollectionType.CALIBRATION``.
        ingestDate : `None`, `Select`, or `Timespan`
            If `Select`, include the ingest timestamp in the result columns.
            If a `Timespan` instance, constrain the results to those whose
            ingest times are inside the given timespan, and also include the
            timestamp in the result columns. If `None` (default), there is no
            constraint and the timestamp is not returned.

        Returns
        -------
        query : `SimpleQuery`
            A struct containing the SQLAlchemy object that represents a
            simple ``SELECT`` query.
        """
        raise NotImplementedError()
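
    # Illustrative usage sketch (hypothetical names; assumes `SimpleQuery`
    # exposes the combined SQLAlchemy statement, e.g. via a ``combine()``
    # method, and that ``db`` is the underlying `Database`)::
    #
    #     query = storage.select(runRecord, dataId=dataId, run=None)
    #     rows = db.query(query.combine()).fetchall()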

    datasetType: DatasetType
    """Dataset type whose records this object manages (`DatasetType`).
    """


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, usually
        a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: Optional[str] = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is the default), add a field that can be
            joined to the dataset primary key, but do not add a foreign key
            constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
        """
        raise NotImplementedError()
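
    # Illustrative usage sketch (hypothetical names; ``tableSpec`` is a
    # `ddl.TableSpec` under construction for a table that references
    # datasets)::
    #
    #     idSpec = manager.addDatasetForeignKey(tableSpec, onDelete="CASCADE")
    #     # tableSpec now includes a "dataset_id" field (and, by default, the
    #     # foreign key constraint).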

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises `KeyError`
        when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        KeyError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        result = self.find(name)
        if result is None:
            raise KeyError(f"Dataset type with name '{name}' not found.")
        return result
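
    # Illustrative usage sketch (hypothetical names): dict-style lookup of
    # the per-dataset-type storage object::
    #
    #     storage = manager["raw"]        # raises KeyError if unknown
    #     storage = manager.find("raw")   # returns None if unknown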

    @abstractmethod
    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type,
            or `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
        """
        raise NotImplementedError()
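
    # Illustrative usage sketch (hypothetical names): registration is
    # idempotent and reports whether the dataset type was new::
    #
    #     storage, inserted = manager.register(datasetType)
    #     if inserted:
    #         ...  # first time this dataset type was registered here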

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def __iter__(self) -> Iterator[DatasetType]:
        """Return an iterator over the dataset types present in this layer.

        Notes
        -----
        Dataset types registered by another client of the same layer since
        the last call to `initialize` or `refresh` may not be included.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]:
        """Return a `DatasetRef` for the given dataset primary key
        value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key values exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()