Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py : 66%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdGenEnum")

from abc import ABC, abstractmethod
import enum
from typing import (
    Any,
    Iterable,
    Iterator,
    Optional,
    Tuple,
    TYPE_CHECKING,
)

from ...core import (
    DataCoordinate,
    DatasetId,
    DatasetRef,
    DatasetType,
    ddl,
    SimpleQuery,
    Timespan,
)
from ._versioning import VersionedExtension

if TYPE_CHECKING:
    from ..summaries import CollectionSummary
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager
    from ._collections import CollectionManager, CollectionRecord, RunRecord


class DatasetIdGenEnum(enum.Enum):
    """This enum is used to specify dataset ID generation options for the
    ``insert()`` method.
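
    Examples
    --------
    A minimal sketch of selecting a generation mode at insert time, assuming
    ``storage`` is a concrete `DatasetRecordStorage` instance and ``run`` and
    ``dataIds`` already exist (all hypothetical here)::

        refs = storage.insert(
            run,
            dataIds,
            idGenerationMode=DatasetIdGenEnum.DATAID_TYPE_RUN,
        )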
58 """

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g.
    auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination
    of dataset type and data ID.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination
    of dataset type, data ID, and run collection name.
    """


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """
    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate],
               idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added.  The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with its own unique ID.
            With a non-`UNIQUE` mode the ID is computed from some combination
            of dataset type, data ID, and run collection name; if the same ID
            is already in the database then the new record is not inserted.

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
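
        Examples
        --------
        A minimal sketch, assuming ``storage`` is a concrete implementation
        of this interface, ``run`` is an existing `RunRecord`, and ``dataId``
        is an expanded `DataCoordinate` (all hypothetical here)::

            (ref,) = storage.insert(run, [dataId])
            # The returned reference is resolved: it carries the new ID.
            assert ref.id is not None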
112 """
113 raise NotImplementedError()

    @abstractmethod
    def import_(self, run: RunRecord, datasets: Iterable[DatasetRef],
                idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
                reuseIds: bool = False) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted.  Datasets may specify an ``id``
            attribute, which will be used for the inserted datasets.  All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            the type of the dataset IDs does not match the type supported by
            this class, the IDs will be ignored and new IDs will be generated
            by the backend.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with its own unique ID.
            With a non-`UNIQUE` mode the ID is computed from some combination
            of dataset type, data ID, and run collection name; if the same ID
            is already in the database then the new record is not inserted.
        reuseIds : `bool`, optional
            If `True`, force re-use of imported dataset IDs for integer IDs,
            which are normally generated as auto-incremented; an exception
            will be raised if the imported IDs clash with existing ones.
            This option has no effect on globally unique IDs, which are
            always re-used (or generated if integer IDs are being imported).

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the given datasets are
        expected to be identical across all of them, but this is not checked
        here; it should be enforced by higher-level registry code.  This
        method does not use those attributes; only ``dataId`` and ``id`` are
        relevant.
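
        Examples
        --------
        A minimal sketch of importing datasets exported from another
        repository, assuming ``storage``, ``run``, and a list of resolved
        ``refs`` (all hypothetical here)::

            imported = list(storage.import_(run, refs, reuseIds=True))
            # IDs are preserved when their type matches this backend;
            # otherwise new IDs are generated as described above.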
156 """
157 raise NotImplementedError()

    @abstractmethod
    def find(self, collection: CollectionRecord, dataId: DataCoordinate,
             timespan: Optional[Timespan] = None) -> Optional[DatasetRef]:
        """Search a collection for a dataset with the given data ID.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection to search for the
            dataset.  May have any `CollectionType`.
        dataId : `DataCoordinate`
            Complete (but not necessarily expanded) data ID to search with,
            with ``dataId.graph == self.datasetType.dimensions``.
        timespan : `Timespan`, optional
            A timespan that the validity range of the dataset must overlap.
            Required if ``collection.type is CollectionType.CALIBRATION``, and
            ignored otherwise.

        Returns
        -------
        ref : `DatasetRef`
            A resolved `DatasetRef` (without components populated), or `None`
            if no matching dataset was found.
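
        Examples
        --------
        A minimal sketch of a calibration lookup, assuming ``storage``,
        ``calibCollection`` (a `~CollectionType.CALIBRATION` collection
        record), ``dataId``, and `astropy.time.Time` values ``begin`` and
        ``end`` (all hypothetical here)::

            ref = storage.find(calibCollection, dataId,
                               timespan=Timespan(begin, end))
            if ref is None:
                ...  # no dataset certified for this data ID and timespan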
182 """
183 raise NotImplementedError()

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be deleted.  All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated.  All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
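
        Examples
        --------
        A minimal sketch, assuming ``storage``, a ``taggedCollection`` record
        of type `~CollectionType.TAGGED`, and resolved ``refs`` (all
        hypothetical here)::

            storage.associate(taggedCollection, refs)
            # Repeating the call is a no-op (outside read-only databases).
            storage.associate(taggedCollection, refs)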
228 """
229 raise NotImplementedError()

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be disassociated.  All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(self, collection: CollectionRecord, datasets: Iterable[DatasetRef],
                timespan: Timespan) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated.  All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        TypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
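
        Examples
        --------
        A minimal sketch, assuming ``storage``, a ``calibCollection`` record
        of type `~CollectionType.CALIBRATION`, resolved ``refs``, and
        `astropy.time.Time` values ``begin`` and ``end`` (all hypothetical
        here)::

            storage.certify(calibCollection, refs, Timespan(begin, end))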
280 """
281 raise NotImplementedError()

    @abstractmethod
    def decertify(self, collection: CollectionRecord, timespan: Timespan, *,
                  dataIds: Optional[Iterable[DataCoordinate]] = None) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range.  If `None`, all data IDs for ``self.datasetType`` will be
            decertified.

        Raises
        ------
        TypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
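
        Examples
        --------
        A minimal sketch that clears a validity range for every data ID of
        this dataset type, assuming ``storage``, ``calibCollection``, and
        ``begin``/``end`` times (all hypothetical here)::

            storage.decertify(calibCollection, Timespan(begin, end))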
308 """
309 raise NotImplementedError()

    @abstractmethod
    def select(self, collection: CollectionRecord,
               dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
               id: SimpleQuery.Select.Or[Optional[DatasetId]] = SimpleQuery.Select,
               run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
               timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
               ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
               ) -> Optional[SimpleQuery]:
        """Return a SQLAlchemy object that represents a ``SELECT`` query for
        this `DatasetType`.

        All arguments can either be a value that constrains the query or
        the `SimpleQuery.Select` tag object to indicate that the value should
        be returned in the columns in the ``SELECT`` clause.  The default is
        `SimpleQuery.Select`.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection to query.  May not be
            of type `CollectionType.CHAINED`.
        dataId : `DataCoordinate` or `Select`
            The data ID to restrict results with, or an instruction to return
            the data ID via columns with names
            ``self.datasetType.dimensions.names``.
        id : `DatasetId`, `Select` or `None`
            The primary key value for the dataset, an instruction to return it
            via an ``id`` column, or `None` to ignore it entirely.
        run : `None` or `Select`
            If `Select` (default), include the dataset's run key value (as a
            column labeled with the return value of
            ``CollectionManager.getRunForeignKeyName``).
            If `None`, do not include this column (to constrain the run,
            pass a `RunRecord` as the ``collection`` argument instead).
        timespan : `None`, `Select`, or `Timespan`
            If `Select` (default), include the validity range timespan in the
            result columns.  If a `Timespan` instance, constrain the results
            to those whose validity ranges overlap that given timespan.
            Ignored unless ``collection.type is CollectionType.CALIBRATION``.
        ingestDate : `None`, `Select`, or `Timespan`
            If `Select`, include the ingest timestamp in the result columns.
            If a `Timespan` instance, constrain the results to those whose
            ingest times are inside the given timespan, and also include the
            timestamp in the result columns.  If `None` (default), there is
            no constraint and the timestamp is not returned.

        Returns
        -------
        query : `SimpleQuery` or `None`
            A struct containing the SQLAlchemy object that represents a
            simple ``SELECT`` query, or `None` if it is known that there are
            no datasets of this `DatasetType` that match the given
            constraints.
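
        Examples
        --------
        A minimal sketch, assuming ``storage`` and a ``collection`` record
        (both hypothetical here); the data ID and dataset ID are returned as
        columns while the run column is omitted.  Turning the result into an
        executable SQLAlchemy statement via ``combine`` is an assumption
        about the `SimpleQuery` API::

            query = storage.select(collection, run=None)
            if query is not None:
                sql = query.combine()  # assumed SimpleQuery method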
363 """
364 raise NotImplementedError()

    datasetType: DatasetType
    """Dataset type whose records this object manages (`DatasetType`).
    """


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext, *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, usually
        a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(cls, tableSpec: ddl.TableSpec, *,
                             name: str = "dataset", constraint: bool = True, onDelete: Optional[str] = None,
                             **kwargs: Any) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table.  Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the collection row is deleted.  `None`
            indicates that this should be an integrity error.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
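
        Examples
        --------
        A minimal sketch, assuming ``Manager`` is a concrete subclass and
        ``tableSpec`` is an existing `ddl.TableSpec` (both hypothetical
        here)::

            idSpec = Manager.addDatasetForeignKey(tableSpec,
                                                  onDelete="CASCADE")
            # ``tableSpec`` now has a ``dataset_id`` field and a foreign
            # key constraint referencing the dataset table.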
456 """
457 raise NotImplementedError()

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises `KeyError`
        when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        KeyError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        result = self.find(name)
        if result is None:
            raise KeyError(f"Dataset type with name '{name}' not found.")
        return result

    @abstractmethod
    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type,
            or `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
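
        Examples
        --------
        A minimal sketch, assuming ``manager`` is a concrete instance and
        ``datasetType`` is an already-constructed `DatasetType` (both
        hypothetical here)::

            storage, inserted = manager.register(datasetType)
            if inserted:
                ...  # the dataset type was newly added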
539 """
540 raise NotImplementedError()

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def __iter__(self) -> Iterator[DatasetType]:
        """Return an iterator over the dataset types present in this layer.

        Notes
        -----
        Dataset types registered by another client of the same layer since
        the last call to `initialize` or `refresh` may not be included.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]:
        """Return a `DatasetRef` for the given dataset primary key
        value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()