Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py : 67%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdGenEnum")
26from abc import ABC, abstractmethod
27import enum
28from typing import (
29 Any,
30 Iterable,
31 Iterator,
32 Optional,
33 Tuple,
34 TYPE_CHECKING,
35)
37from ...core import (
38 DataCoordinate,
39 DatasetId,
40 DatasetRef,
41 DatasetType,
42 ddl,
43 SimpleQuery,
44 Timespan,
45)
46from ._versioning import VersionedExtension
48if TYPE_CHECKING: 48 ↛ 49line 48 didn't jump to line 49, because the condition on line 48 was never true
49 from ..summaries import CollectionSummary
50 from ._database import Database, StaticTablesContext
51 from ._dimensions import DimensionRecordStorageManager
52 from ._collections import CollectionManager, CollectionRecord, RunRecord
55class DatasetIdGenEnum(enum.Enum):
56 """This enum is used to specify dataset ID generation options for
57 ``insert()`` method.
58 """
60 UNIQUE = 0
61 """Unique mode generates unique ID for each inserted dataset, e.g.
62 auto-generated by database or random UUID.
63 """
65 DATAID_TYPE = 1
66 """In this mode ID is computed deterministically from a combination of
67 dataset type and dataId.
68 """
70 DATAID_TYPE_RUN = 2
71 """In this mode ID is computed deterministically from a combination of
72 dataset type, dataId, and run collection name.
73 """
76class DatasetRecordStorage(ABC):
77 """An interface that manages the records associated with a particular
78 `DatasetType`.
80 Parameters
81 ----------
82 datasetType : `DatasetType`
83 Dataset type whose records this object manages.
84 """
85 def __init__(self, datasetType: DatasetType):
86 self.datasetType = datasetType
88 @abstractmethod
89 def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate],
90 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]:
91 """Insert one or more dataset entries into the database.
93 Parameters
94 ----------
95 run : `RunRecord`
96 The record object describing the `~CollectionType.RUN` collection
97 this dataset will be associated with.
98 dataIds : `Iterable` [ `DataCoordinate` ]
99 Expanded data IDs (`DataCoordinate` instances) for the
100 datasets to be added. The dimensions of all data IDs must be the
101 same as ``self.datasetType.dimensions``.
102 idMode : `DatasetIdGenEnum`
103 With `UNIQUE` each new dataset is inserted with its new unique ID.
104 With non-`UNIQUE` mode ID is computed from some combination of
105 dataset type, dataId, and run collection name; if the same ID is
106 already in the database then new record is not inserted.
108 Returns
109 -------
110 datasets : `Iterable` [ `DatasetRef` ]
111 References to the inserted datasets.
112 """
113 raise NotImplementedError()
115 @abstractmethod
116 def import_(self, run: RunRecord, datasets: Iterable[DatasetRef],
117 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
118 reuseIds: bool = False) -> Iterator[DatasetRef]:
119 """Insert one or more dataset entries into the database.
121 Parameters
122 ----------
123 run : `RunRecord`
124 The record object describing the `~CollectionType.RUN` collection
125 this dataset will be associated with.
126 datasets : `~collections.abc.Iterable` of `DatasetRef`
127 Datasets to be inserted. Datasets can specify ``id`` attribute
128 which will be used for inserted datasets. All dataset IDs must
129 have the same type (`int` or `uuid.UUID`), if type of dataset IDs
130 does not match type supported by this class then IDs will be
131 ignored and new IDs will be generated by backend.
132 idGenerationMode : `DatasetIdGenEnum`
133 With `UNIQUE` each new dataset is inserted with its new unique ID.
134 With non-`UNIQUE` mode ID is computed from some combination of
135 dataset type, dataId, and run collection name; if the same ID is
136 already in the database then new record is not inserted.
137 reuseIds : `bool`, optional
138 If `True` then forces re-use of imported dataset IDs for integer
139 IDs which are normally generated as auto-incremented; exception
140 will be raised if imported IDs clash with existing ones. This
141 option has no effect on the use of globally-unique IDs which are
142 always re-used (or generated if integer IDs are being imported).
144 Returns
145 -------
146 datasets : `Iterable` [ `DatasetRef` ]
147 References to the inserted or existing datasets.
149 Notes
150 -----
151 The ``datasetType`` and ``run`` attributes of datasets are supposed to
152 be identical across all datasets but this is not checked and it should
153 be enforced by higher level registry code. This method does not need
154 to use those attributes from datasets, only ``dataId`` and ``id`` are
155 relevant.
156 """
157 raise NotImplementedError()
159 @abstractmethod
160 def find(self, collection: CollectionRecord, dataId: DataCoordinate,
161 timespan: Optional[Timespan] = None) -> Optional[DatasetRef]:
162 """Search a collection for a dataset with the given data ID.
164 Parameters
165 ----------
166 collection : `CollectionRecord`
167 The record object describing the collection to search for the
168 dataset. May have any `CollectionType`.
169 dataId: `DataCoordinate`
170 Complete (but not necessarily expanded) data ID to search with,
171 with ``dataId.graph == self.datasetType.dimensions``.
172 timespan : `Timespan`, optional
173 A timespan that the validity range of the dataset must overlap.
174 Required if ``collection.type is CollectionType.CALIBRATION``, and
175 ignored otherwise.
177 Returns
178 -------
179 ref : `DatasetRef`
180 A resolved `DatasetRef` (without components populated), or `None`
181 if no matching dataset was found.
182 """
183 raise NotImplementedError()
185 @abstractmethod
186 def delete(self, datasets: Iterable[DatasetRef]) -> None:
187 """Fully delete the given datasets from the registry.
189 Parameters
190 ----------
191 datasets : `Iterable` [ `DatasetRef` ]
192 Datasets to be deleted. All datasets must be resolved and have
193 the same `DatasetType` as ``self``.
195 Raises
196 ------
197 AmbiguousDatasetError
198 Raised if any of the given `DatasetRef` instances is unresolved.
199 """
200 raise NotImplementedError()
202 @abstractmethod
203 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
204 """Associate one or more datasets with a collection.
206 Parameters
207 ----------
208 collection : `CollectionRecord`
209 The record object describing the collection. ``collection.type``
210 must be `~CollectionType.TAGGED`.
211 datasets : `Iterable` [ `DatasetRef` ]
212 Datasets to be associated. All datasets must be resolved and have
213 the same `DatasetType` as ``self``.
215 Raises
216 ------
217 AmbiguousDatasetError
218 Raised if any of the given `DatasetRef` instances is unresolved.
220 Notes
221 -----
222 Associating a dataset with into collection that already contains a
223 different dataset with the same `DatasetType` and data ID will remove
224 the existing dataset from that collection.
226 Associating the same dataset into a collection multiple times is a
227 no-op, but is still not permitted on read-only databases.
228 """
229 raise NotImplementedError()
231 @abstractmethod
232 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
233 """Remove one or more datasets from a collection.
235 Parameters
236 ----------
237 collection : `CollectionRecord`
238 The record object describing the collection. ``collection.type``
239 must be `~CollectionType.TAGGED`.
240 datasets : `Iterable` [ `DatasetRef` ]
241 Datasets to be disassociated. All datasets must be resolved and
242 have the same `DatasetType` as ``self``.
244 Raises
245 ------
246 AmbiguousDatasetError
247 Raised if any of the given `DatasetRef` instances is unresolved.
248 """
249 raise NotImplementedError()
251 @abstractmethod
252 def certify(self, collection: CollectionRecord, datasets: Iterable[DatasetRef],
253 timespan: Timespan) -> None:
254 """Associate one or more datasets with a calibration collection and a
255 validity range within it.
257 Parameters
258 ----------
259 collection : `CollectionRecord`
260 The record object describing the collection. ``collection.type``
261 must be `~CollectionType.CALIBRATION`.
262 datasets : `Iterable` [ `DatasetRef` ]
263 Datasets to be associated. All datasets must be resolved and have
264 the same `DatasetType` as ``self``.
265 timespan : `Timespan`
266 The validity range for these datasets within the collection.
268 Raises
269 ------
270 AmbiguousDatasetError
271 Raised if any of the given `DatasetRef` instances is unresolved.
272 ConflictingDefinitionError
273 Raised if the collection already contains a different dataset with
274 the same `DatasetType` and data ID and an overlapping validity
275 range.
276 TypeError
277 Raised if
278 ``collection.type is not CollectionType.CALIBRATION`` or if
279 ``self.datasetType.isCalibration() is False``.
280 """
281 raise NotImplementedError()
283 @abstractmethod
284 def decertify(self, collection: CollectionRecord, timespan: Timespan, *,
285 dataIds: Optional[Iterable[DataCoordinate]] = None) -> None:
286 """Remove or adjust datasets to clear a validity range within a
287 calibration collection.
289 Parameters
290 ----------
291 collection : `CollectionRecord`
292 The record object describing the collection. ``collection.type``
293 must be `~CollectionType.CALIBRATION`.
294 timespan : `Timespan`
295 The validity range to remove datasets from within the collection.
296 Datasets that overlap this range but are not contained by it will
297 have their validity ranges adjusted to not overlap it, which may
298 split a single dataset validity range into two.
299 dataIds : `Iterable` [ `DataCoordinate` ], optional
300 Data IDs that should be decertified within the given validity range
301 If `None`, all data IDs for ``self.datasetType`` will be
302 decertified.
304 Raises
305 ------
306 TypeError
307 Raised if ``collection.type is not CollectionType.CALIBRATION``.
308 """
309 raise NotImplementedError()
311 @abstractmethod
312 def select(self, collection: CollectionRecord,
313 dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
314 id: SimpleQuery.Select.Or[Optional[DatasetId]] = SimpleQuery.Select,
315 run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
316 timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
317 ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
318 ) -> Optional[SimpleQuery]:
319 """Return a SQLAlchemy object that represents a ``SELECT`` query for
320 this `DatasetType`.
322 All arguments can either be a value that constrains the query or
323 the `SimpleQuery.Select` tag object to indicate that the value should
324 be returned in the columns in the ``SELECT`` clause. The default is
325 `SimpleQuery.Select`.
327 Parameters
328 ----------
329 collection : `CollectionRecord`
330 The record object describing the collection to query. May not be
331 of type `CollectionType.CHAINED`.
332 dataId : `DataCoordinate` or `Select`
333 The data ID to restrict results with, or an instruction to return
334 the data ID via columns with names
335 ``self.datasetType.dimensions.names``.
336 id : `DatasetId`, `Select` or None,
337 The primary key value for the dataset, an instruction to return it
338 via a ``id`` column, or `None` to ignore it entirely.
339 run : `None` or `Select`
340 If `Select` (default), include the dataset's run key value (as
341 column labeled with the return value of
342 ``CollectionManager.getRunForiegnKeyName``).
343 If `None`, do not include this column (to constrain the run,
344 pass a `RunRecord` as the ``collection`` argument instead).
345 timespan : `None`, `Select`, or `Timespan`
346 If `Select` (default), include the validity range timespan in the
347 result columns. If a `Timespan` instance, constrain the results to
348 those whose validity ranges overlap that given timespan. Ignored
349 unless ``collection.type is CollectionType.CALIBRATION``.
350 ingestDate : `None`, `Select`, or `Timespan`
351 If `Select` include the ingest timestamp in the result columns.
352 If a `Timespan` instance, constrain the results to those whose
353 ingest times which are inside given timespan and also include
354 timestamp in the result columns. If `None` (default) then there is
355 no constraint and timestamp is not returned.
357 Returns
358 -------
359 query : `SimpleQuery` or `None`
360 A struct containing the SQLAlchemy object that representing a
361 simple ``SELECT`` query, or `None` if it is known that there are
362 no datasets of this `DatasetType` that match the given constraints.
363 """
364 raise NotImplementedError()
366 datasetType: DatasetType
367 """Dataset type whose records this object manages (`DatasetType`).
368 """
371class DatasetRecordStorageManager(VersionedExtension):
372 """An interface that manages the tables that describe datasets.
374 `DatasetRecordStorageManager` primarily serves as a container and factory
375 for `DatasetRecordStorage` instances, which each provide access to the
376 records for a different `DatasetType`.
377 """
379 @classmethod
380 @abstractmethod
381 def initialize(
382 cls,
383 db: Database,
384 context: StaticTablesContext, *,
385 collections: CollectionManager,
386 dimensions: DimensionRecordStorageManager,
387 ) -> DatasetRecordStorageManager:
388 """Construct an instance of the manager.
390 Parameters
391 ----------
392 db : `Database`
393 Interface to the underlying database engine and namespace.
394 context : `StaticTablesContext`
395 Context object obtained from `Database.declareStaticTables`; used
396 to declare any tables that should always be present.
397 collections: `CollectionManager`
398 Manager object for the collections in this `Registry`.
399 dimensions : `DimensionRecordStorageManager`
400 Manager object for the dimensions in this `Registry`.
402 Returns
403 -------
404 manager : `DatasetRecordStorageManager`
405 An instance of a concrete `DatasetRecordStorageManager` subclass.
406 """
407 raise NotImplementedError()
409 @classmethod
410 @abstractmethod
411 def getIdColumnType(cls) -> type:
412 """Return type used for columns storing dataset IDs.
414 This type is used for columns storing `DatasetRef.id` values, usually
415 a `type` subclass provided by SQLAlchemy.
417 Returns
418 -------
419 dtype : `type`
420 Type used for dataset identification in database.
421 """
422 raise NotImplementedError()
424 @classmethod
425 @abstractmethod
426 def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
427 """Test whether the given dataset ID generation mode is supported by
428 `insert`.
430 Parameters
431 ----------
432 mode : `DatasetIdGenEnum`
433 Enum value for the mode to test.
435 Returns
436 -------
437 supported : `bool`
438 Whether the given mode is supported.
439 """
440 raise NotImplementedError()
442 @classmethod
443 @abstractmethod
444 def addDatasetForeignKey(cls, tableSpec: ddl.TableSpec, *,
445 name: str = "dataset", constraint: bool = True, onDelete: Optional[str] = None,
446 **kwargs: Any) -> ddl.FieldSpec:
447 """Add a foreign key (field and constraint) referencing the dataset
448 table.
450 Parameters
451 ----------
452 tableSpec : `ddl.TableSpec`
453 Specification for the table that should reference the dataset
454 table. Will be modified in place.
455 name: `str`, optional
456 A name to use for the prefix of the new field; the full name is
457 ``{name}_id``.
458 onDelete: `str`, optional
459 One of "CASCADE" or "SET NULL", indicating what should happen to
460 the referencing row if the collection row is deleted. `None`
461 indicates that this should be an integrity error.
462 constraint: `bool`, optional
463 If `False` (`True` is default), add a field that can be joined to
464 the dataset primary key, but do not add a foreign key constraint.
465 **kwargs
466 Additional keyword arguments are forwarded to the `ddl.FieldSpec`
467 constructor (only the ``name`` and ``dtype`` arguments are
468 otherwise provided).
470 Returns
471 -------
472 idSpec : `ddl.FieldSpec`
473 Specification for the ID field.
474 """
475 raise NotImplementedError()
477 @abstractmethod
478 def refresh(self) -> None:
479 """Ensure all other operations on this manager are aware of any
480 dataset types that may have been registered by other clients since
481 it was initialized or last refreshed.
482 """
483 raise NotImplementedError()
485 def __getitem__(self, name: str) -> DatasetRecordStorage:
486 """Return the object that provides access to the records associated
487 with the given `DatasetType` name.
489 This is simply a convenience wrapper for `find` that raises `KeyError`
490 when the dataset type is not found.
492 Returns
493 -------
494 records : `DatasetRecordStorage`
495 The object representing the records for the given dataset type.
497 Raises
498 ------
499 KeyError
500 Raised if there is no dataset type with the given name.
502 Notes
503 -----
504 Dataset types registered by another client of the same repository since
505 the last call to `initialize` or `refresh` may not be found.
506 """
507 result = self.find(name)
508 if result is None:
509 raise KeyError(f"Dataset type with name '{name}' not found.")
510 return result
512 @abstractmethod
513 def find(self, name: str) -> Optional[DatasetRecordStorage]:
514 """Return an object that provides access to the records associated with
515 the given `DatasetType` name, if one exists.
517 Parameters
518 ----------
519 name : `str`
520 Name of the dataset type.
522 Returns
523 -------
524 records : `DatasetRecordStorage` or `None`
525 The object representing the records for the given dataset type, or
526 `None` if there are no records for that dataset type.
528 Notes
529 -----
530 Dataset types registered by another client of the same repository since
531 the last call to `initialize` or `refresh` may not be found.
532 """
533 raise NotImplementedError()
535 @abstractmethod
536 def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
537 """Ensure that this `Registry` can hold records for the given
538 `DatasetType`, creating new tables as necessary.
540 Parameters
541 ----------
542 datasetType : `DatasetType`
543 Dataset type for which a table should created (as necessary) and
544 an associated `DatasetRecordStorage` returned.
546 Returns
547 -------
548 records : `DatasetRecordStorage`
549 The object representing the records for the given dataset type.
550 inserted : `bool`
551 `True` if the dataset type did not exist in the registry before.
553 Notes
554 -----
555 This operation may not be invoked within a `Database.transaction`
556 context.
557 """
558 raise NotImplementedError()
560 @abstractmethod
561 def remove(self, name: str) -> None:
562 """Remove the dataset type.
564 Parameters
565 ----------
566 name : `str`
567 Name of the dataset type.
568 """
569 raise NotImplementedError()
571 @abstractmethod
572 def __iter__(self) -> Iterator[DatasetType]:
573 """Return an iterator over the the dataset types present in this layer.
575 Notes
576 -----
577 Dataset types registered by another client of the same layer since
578 the last call to `initialize` or `refresh` may not be included.
579 """
580 raise NotImplementedError()
582 @abstractmethod
583 def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]:
584 """Return a `DatasetRef` for the given dataset primary key
585 value.
587 Parameters
588 ----------
589 id : `DatasetId`
590 Primary key value for the dataset.
592 Returns
593 -------
594 ref : `DatasetRef` or `None`
595 Object representing the dataset, or `None` if no dataset with the
596 given primary key values exists in this layer.
597 """
598 raise NotImplementedError()
600 @abstractmethod
601 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
602 """Return a summary for the given collection.
604 Parameters
605 ----------
606 collection : `CollectionRecord`
607 Record describing the collection for which a summary is to be
608 retrieved.
610 Returns
611 -------
612 summary : `CollectionSummary`
613 Summary of the dataset types and governor dimension values in
614 this collection.
615 """
616 raise NotImplementedError()