Coverage for python/lsst/daf/butler/core/datasets/ref.py: 32% of 198 statements
coverage.py v7.2.7, created at 2023-06-15 09:13 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = [
    "AmbiguousDatasetError",
    "DatasetId",
    "DatasetIdFactory",
    "DatasetIdGenEnum",
    "DatasetRef",
    "SerializedDatasetRef",
]

import enum
import uuid
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, ClassVar

from lsst.utils.classes import immutable
from pydantic import BaseModel, ConstrainedInt, StrictStr, validator

from ..configSupport import LookupKey
from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate
from ..json import from_json_pydantic, to_json_pydantic
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType

if TYPE_CHECKING:
    from ...registry import Registry
    from ..storageClass import StorageClass


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class PositiveInt(ConstrainedInt):
    # Strict, non-negative integer: no type coercion, and ``ge = 0`` admits
    # zero despite the name.
    ge = 0
    strict = True


class DatasetIdGenEnum(enum.Enum):
    """Enum used to specify dataset ID generation options."""

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g.
    auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination
    of dataset type and dataId.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination
    of dataset type, dataId, and run collection name.
    """


class DatasetIdFactory:
    """Factory for dataset IDs (UUIDs).

    For now the logic is hard-coded and is controlled by the user-provided
    value of `DatasetIdGenEnum`. In the future we may implement configurable
    logic that can guess the `DatasetIdGenEnum` value from other parameters.
    """

    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    """Namespace UUID used for UUID5 generation. Do not change. This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def makeDatasetId(
        self,
        run: str,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        idGenerationMode: DatasetIdGenEnum,
    ) -> uuid.UUID:
        """Generate dataset ID for a dataset.

        Parameters
        ----------
        run : `str`
            Name of the RUN collection for the dataset.
        datasetType : `DatasetType`
            Dataset type.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
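
        Examples
        --------
        An illustrative sketch, where ``dataset_type`` and ``data_id`` are
        assumed placeholders for an existing `DatasetType` and an expanded
        `DataCoordinate`:

        .. code-block:: py

            factory = DatasetIdFactory()
            # Deterministic mode: identical inputs yield the same UUID5.
            id1 = factory.makeDatasetId(
                "run1", dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN
            )
            id2 = factory.makeDatasetId(
                "run1", dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN
            )
            assert id1 == id2
            # UNIQUE mode: every call yields a fresh random UUID4.
            id3 = factory.makeDatasetId(
                "run1", dataset_type, data_id, DatasetIdGenEnum.UNIQUE
            )
            assert id3 != id1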
125 """
126 if idGenerationMode is DatasetIdGenEnum.UNIQUE:
127 return uuid.uuid4()
128 else:
129 # WARNING: If you modify this code make sure that the order of
130 # items in the `items` list below never changes.
131 items: list[tuple[str, str]] = []
132 if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
133 items = [
134 ("dataset_type", datasetType.name),
135 ]
136 elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
137 items = [
138 ("dataset_type", datasetType.name),
139 ("run", run),
140 ]
141 else:
142 raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")
144 for name, value in sorted(dataId.byName().items()):
145 items.append((name, str(value)))
146 data = ",".join(f"{key}={value}" for key, value in items)
147 return uuid.uuid5(self.NS_UUID, data)


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    id: uuid.UUID
    datasetType: SerializedDatasetType | None = None
    dataId: SerializedDataCoordinate | None = None
    run: StrictStr | None = None
    component: StrictStr | None = None

    @validator("dataId")
    def _check_dataId(cls, v: Any, values: dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Cannot specify 'dataId' without specifying 'datasetType'.")
        return v

    @validator("run")
    def _check_run(cls, v: Any, values: dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def _check_component(cls, v: Any, values: dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given.
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) cannot be set if component is given ({v}).")
        return v

    @classmethod
    def direct(
        cls,
        *,
        id: str,
        run: str,
        datasetType: dict[str, Any] | None = None,
        dataId: dict[str, Any] | None = None,
        component: str | None = None,
    ) -> SerializedDatasetRef:
        """Construct a `SerializedDatasetRef` directly without validators.

        Notes
        -----
        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        The ``id`` parameter is a string representation of the dataset ID;
        it is converted to a UUID by this method.

        This method should only be called when the inputs are trusted.
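
        Examples
        --------
        A sketch of trusted-input construction (the UUID string and run
        name here are arbitrary illustrative values):

        .. code-block:: py

            sref = SerializedDatasetRef.direct(
                id="840b31d9-05cd-5161-b2c8-00d32b280d0f",
                run="run1",
            )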
200 """
201 node = SerializedDatasetRef.__new__(cls)
202 setter = object.__setattr__
203 setter(node, "id", uuid.UUID(id))
204 setter(
205 node,
206 "datasetType",
207 datasetType if datasetType is None else SerializedDatasetType.direct(**datasetType),
208 )
209 setter(node, "dataId", dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
210 setter(node, "run", run)
211 setter(node, "component", component)
212 setter(node, "__fields_set__", {"id", "datasetType", "dataId", "run", "component"})
213 return node


DatasetId = uuid.UUID
"""A type-annotation alias for dataset ID providing typing flexibility.
"""


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    run : `str`
        The name of the run this dataset was associated with when it was
        created.
    id : `DatasetId`, optional
        The unique identifier assigned when the dataset is created. If ``id``
        is not specified, a new unique ID will be created.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.
    id_generation_mode : `DatasetIdGenEnum`
        ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
        UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
        deterministic UUID5-type ID based on a dataset type name and
        ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
        deterministic UUID5-type ID based on a dataset type name, run
        collection name, and ``dataId``.

    See Also
    --------
    :ref:`daf_butler_organizing_datasets`
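
    Examples
    --------
    An illustrative construction, assuming ``registry`` is an existing
    `Registry` with conventional instrument dimensions registered (the
    dataset type name, data ID values, and run name are placeholders):

    .. code-block:: py

        dataset_type = registry.getDatasetType("calexp")
        data_id = registry.expandDataId(
            instrument="HSC", detector=50, visit=903334
        )
        ref = DatasetRef(dataset_type, data_id, run="HSC/runs/demo")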
258 """
260 _serializedType = SerializedDatasetRef
261 __slots__ = (
262 "id",
263 "datasetType",
264 "dataId",
265 "run",
266 )
268 def __init__(
269 self,
270 datasetType: DatasetType,
271 dataId: DataCoordinate,
272 run: str,
273 *,
274 id: DatasetId | None = None,
275 conform: bool = True,
276 id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
277 ):
278 self.datasetType = datasetType
279 if conform:
280 self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
281 else:
282 self.dataId = dataId
283 self.run = run
284 if id is not None:
285 self.id = id
286 else:
287 self.id = DatasetIdFactory().makeDatasetId(
288 self.run, self.datasetType, self.dataId, id_generation_mode
289 )

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below
        # because DataCoordinate's __repr__ - while adhering to the
        # guidelines for __repr__ - is much harder for users to read, while
        # its __str__ just produces a dict that can also be passed to
        # DatasetRef's constructor.
        return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, run={self.run!r}, id={self.id})"

    def __str__(self) -> str:
        s = (
            f"{self.datasetType.name}@{self.dataId!s} [sc={self.datasetType.storageClass_name}]"
            f" (run={self.run} id={self.id})"
        )
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, DatasetType name, and then by DataCoordinate.
        # The __str__ representation is probably close enough, but we need
        # to ensure that sorting a DatasetRef matches what you would get if
        # you sorted DatasetType + DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; this takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in the priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)

    def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
        """Convert this object to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetRef`
            The object converted to its simplified pydantic form.
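
        Examples
        --------
        A serialization round trip (a sketch; ``ref`` is an existing
        resolved `DatasetRef` and ``universe`` its `DimensionUniverse`):

        .. code-block:: py

            simple = ref.to_simple()
            restored = DatasetRef.from_simple(simple, universe=universe)
            assert restored == ref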
349 """
350 if minimal:
351 # The only thing needed to uniquely define a DatasetRef is its id
352 # so that can be used directly if it is not a component DatasetRef.
353 # Store is in a dict to allow us to easily add the planned origin
354 # information later without having to support an int and dict in
355 # simple form.
356 simple: dict[str, Any] = {"id": self.id}
357 if self.isComponent():
358 # We can still be a little minimalist with a component
359 # but we will also need to record the datasetType component
360 simple["component"] = self.datasetType.component()
361 return SerializedDatasetRef(**simple)
363 return SerializedDatasetRef(
364 datasetType=self.datasetType.to_simple(minimal=minimal),
365 dataId=self.dataId.to_simple(),
366 run=self.run,
367 id=self.id,
368 )

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatasetRef,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
        datasetType: DatasetType | None = None,
    ) -> DatasetRef:
        """Construct a new object from simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetRef`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple form of a DatasetRef to
            a full `DatasetRef`. Can be `None` if a full description of
            the type is provided along with a universe.
        datasetType : `DatasetType`, optional
            If supplied, this will be used as the `DatasetType` of the
            resulting `DatasetRef` instead of being read from the
            `SerializedDatasetRef`. This is useful when many refs share the
            same type, as memory can be saved. Defaults to `None`.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
        """
        # Minimalist component will just specify component and id and
        # require registry to reconstruct.
        if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}):
            if registry is None:
                raise ValueError("Registry is required to construct a DatasetRef from an id alone")
            if simple.id is None:
                raise ValueError("For a minimal DatasetRef the ID must be defined.")
            ref = registry.getDataset(simple.id)
            if ref is None:
                raise RuntimeError(f"No matching dataset found in registry for id {simple.id}")
            if simple.component:
                ref = ref.makeComponentRef(simple.component)
            return ref

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            universe = registry.dimensions

        if universe is None:
            # This is for mypy.
            raise ValueError("Unable to determine a usable universe")

        if simple.datasetType is None and datasetType is None:
            # mypy
            raise ValueError("The DatasetType must be specified to construct a DatasetRef")
        if datasetType is None:
            if simple.datasetType is None:
                raise ValueError("Cannot determine Dataset type of this serialized class")
            datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)

        if simple.dataId is None:
            # mypy
            raise ValueError("The DataId must be specified to construct a DatasetRef")
        dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)

        # Check that the simple ref is resolved.
        if simple.run is None:
            dstr = ""
            if simple.datasetType is None:
                dstr = f" (datasetType={datasetType.name!r})"
            raise ValueError(
                "Run collection name is missing from serialized representation. "
                f"Encountered with {simple!r}{dstr}."
            )

        return cls(datasetType, dataId, id=simple.id, run=simple.run)

    to_json = to_json_pydantic
    from_json: ClassVar = classmethod(from_json_pydantic)

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: DatasetId,
        run: str,
    ) -> DatasetRef:
        """Create new `DatasetRef`.

        A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(
            datasetType=self.datasetType, dataId=dataId, id=self.id, run=self.run, conform=False
        )

    def isComponent(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Indicate whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names: tuple[LookupKey, ...] = self.datasetType._lookupNames()

        if "instrument" in self.dataId:
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) for n in names) + names

        return names

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, list[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
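
        Examples
        --------
        A sketch, assuming ``refs`` is an iterable of `DatasetRef`:

        .. code-block:: py

            for dataset_type, refs_of_type in DatasetRef.groupByType(refs).items():
                print(dataset_type.name, len(refs_of_type))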
554 """
555 result: NamedKeyDict[DatasetType, list[DatasetRef]] = NamedKeyDict()
556 for ref in refs:
557 result.setdefault(ref.datasetType, []).append(ref)
558 return result

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.

        Requires that this `DatasetRef` is a component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the
            composite parent of this component, and the same ID and run
            as ``self``.
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(
            self.datasetType.makeCompositeDatasetType(), self.dataId, id=self.id, run=self.run, conform=False
        )

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run as ``self``.
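
        Examples
        --------
        A sketch, assuming ``ref`` refers to a composite dataset type that
        defines a ``wcs`` component:

        .. code-block:: py

            wcs_ref = ref.makeComponentRef("wcs")
            assert wcs_ref.isComponent()
            # makeCompositeRef() recovers a ref to the parent dataset.
            parent_ref = wcs_ref.makeCompositeRef()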
592 """
593 # Assume that the data ID does not need to be standardized
594 # and should match whatever this ref already has.
595 return DatasetRef(
596 self.datasetType.makeComponentDatasetType(name),
597 self.dataId,
598 id=self.id,
599 run=self.run,
600 conform=False,
601 )

    def overrideStorageClass(self, storageClass: str | StorageClass) -> DatasetRef:
        """Create a new `DatasetRef` from this one, but with a modified
        `DatasetType` that has a different `StorageClass`.

        Parameters
        ----------
        storageClass : `str` or `StorageClass`
            The new storage class.

        Returns
        -------
        modified : `DatasetRef`
            A new dataset reference that is the same as the current one but
            with a different storage class in the `DatasetType`.
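
        Examples
        --------
        A sketch; ``"NewStorageClass"`` is a hypothetical storage class
        name that would have to be known to the repository and convertible
        from this ref's original storage class:

        .. code-block:: py

            new_ref = ref.overrideStorageClass("NewStorageClass")
            assert ref.is_compatible_with(new_ref)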
617 """
618 return DatasetRef(
619 datasetType=self.datasetType.overrideStorageClass(storageClass),
620 dataId=self.dataId,
621 id=self.id,
622 run=self.run,
623 conform=False,
624 )

    def is_compatible_with(self, ref: DatasetRef) -> bool:
        """Determine if the given `DatasetRef` is compatible with this one.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset ref to check.

        Returns
        -------
        is_compatible : `bool`
            Returns `True` if the other dataset ref is either the same as
            this one, or its dataset type is compatible with this one's and
            the dataId, run, and dataset ID match.

        Notes
        -----
        Compatibility requires that the dataId, run, and dataset ID match
        and the `DatasetType` is compatible. Compatibility is defined as
        the storage class associated with the dataset type of the other
        ref being convertible to this storage class.

        Specifically this means that if you have done:

        .. code-block:: py

            new_ref = ref.overrideStorageClass(sc)

        and this is successful, then the guarantee is that:

        .. code-block:: py

            assert ref.is_compatible_with(new_ref) is True

        since we know that the python type associated with the new ref can
        be converted to the original python type. The reverse is not
        guaranteed and depends on whether bidirectional converters have been
        registered.
        """
        if self.id != ref.id:
            return False
        if self.dataId != ref.dataId:
            return False
        if self.run != ref.run:
            return False
        return self.datasetType.is_compatible_with(ref.datasetType)

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: str
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed.
    """

    id: DatasetId
    """Primary key of the dataset (`DatasetId`).

    Cannot be changed after a `DatasetRef` is constructed.
    """