Coverage for python/lsst/daf/butler/core/datasets/ref.py: 32%
197 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = [
    "AmbiguousDatasetError",
    "DatasetId",
    "DatasetIdFactory",
    "DatasetIdGenEnum",
    "DatasetRef",
    "SerializedDatasetRef",
]

import enum
import uuid
from typing import TYPE_CHECKING, Any, ClassVar, Dict, Iterable, List, Optional, Tuple

from lsst.utils.classes import immutable
from pydantic import BaseModel, ConstrainedInt, StrictStr, validator

from ..configSupport import LookupKey
from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate
from ..json import from_json_pydantic, to_json_pydantic
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType

if TYPE_CHECKING:
    from ...registry import Registry
    from ..storageClass import StorageClass


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class PositiveInt(ConstrainedInt):
    ge = 0
    strict = True


class DatasetIdGenEnum(enum.Enum):
    """This enum is used to specify dataset ID generation options."""

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g.
    auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination
    of dataset type and dataId.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination
    of dataset type, dataId, and run collection name.
    """


class DatasetIdFactory:
    """Factory for dataset IDs (UUIDs).

    For now the logic is hard-coded and is controlled by the user-provided
    value of `DatasetIdGenEnum`. In the future we may implement a configurable
    logic that can guess `DatasetIdGenEnum` value from other parameters.
    """

    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    """Namespace UUID used for UUID5 generation. Do not change. This was
    produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
    """

    def makeDatasetId(
        self,
        run: str,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        idGenerationMode: DatasetIdGenEnum,
    ) -> uuid.UUID:
        """Generate a dataset ID for a dataset.

        Parameters
        ----------
        run : `str`
            Name of the RUN collection for the dataset.
        datasetType : `DatasetType`
            Dataset type.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
        """
        if idGenerationMode is DatasetIdGenEnum.UNIQUE:
            return uuid.uuid4()
        else:
            # WARNING: If you modify this code make sure that the order of
            # items in the `items` list below never changes.
            items: list[tuple[str, str]] = []
            if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
                items = [
                    ("dataset_type", datasetType.name),
                ]
            elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
                items = [
                    ("dataset_type", datasetType.name),
                    ("run", run),
                ]
            else:
                raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

            for name, value in sorted(dataId.byName().items()):
                items.append((name, str(value)))
            data = ",".join(f"{key}={value}" for key, value in items)
            return uuid.uuid5(self.NS_UUID, data)
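
# Illustrative usage sketch (not part of the original module). ``my_type``
# and ``my_data_id`` stand in for real `DatasetType` and expanded
# `DataCoordinate` instances. In DATAID_TYPE_RUN mode the same inputs always
# yield the same UUID5, while UNIQUE mode yields a fresh UUID4 every call:
#
#     factory = DatasetIdFactory()
#     a = factory.makeDatasetId("run/a", my_type, my_data_id,
#                               DatasetIdGenEnum.DATAID_TYPE_RUN)
#     b = factory.makeDatasetId("run/a", my_type, my_data_id,
#                               DatasetIdGenEnum.DATAID_TYPE_RUN)
#     assert a == b                      # deterministic for identical inputs
#     assert a.version == 5
#     u = factory.makeDatasetId("run/a", my_type, my_data_id,
#                               DatasetIdGenEnum.UNIQUE)
#     assert u.version == 4              # random, differs on every call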


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    id: uuid.UUID
    datasetType: Optional[SerializedDatasetType] = None
    dataId: Optional[SerializedDataCoordinate] = None
    run: Optional[StrictStr] = None
    component: Optional[StrictStr] = None

    @validator("dataId")
    def _check_dataId(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Can not specify 'dataId' without specifying 'datasetType'")
        return v

    @validator("run")
    def _check_run(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def _check_component(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given.
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).")
        return v

    @classmethod
    def direct(
        cls,
        *,
        id: str,
        run: str,
        datasetType: Optional[Dict[str, Any]] = None,
        dataId: Optional[Dict[str, Any]] = None,
        component: Optional[str] = None,
    ) -> SerializedDatasetRef:
        """Construct a `SerializedDatasetRef` directly without validators.

        Notes
        -----
        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        The ``id`` parameter is a string representation of the dataset ID
        and is converted to a UUID by this method.

        This method should only be called when the inputs are trusted.
        """
        node = SerializedDatasetRef.__new__(cls)
        setter = object.__setattr__
        setter(node, "id", uuid.UUID(id))
        setter(
            node,
            "datasetType",
            datasetType if datasetType is None else SerializedDatasetType.direct(**datasetType),
        )
        setter(node, "dataId", dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
        setter(node, "run", run)
        setter(node, "component", component)
        setter(node, "__fields_set__", {"id", "datasetType", "dataId", "run", "component"})
        return node
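
# Illustrative usage sketch (not part of the original module): ``direct``
# bypasses pydantic validation, so it should only see trusted, pre-validated
# values. The UUID string below is an arbitrary example value:
#
#     sref = SerializedDatasetRef.direct(
#         id="11111111-2222-3333-4444-555555555555",
#         run="run/a",
#     )
#     assert isinstance(sref.id, uuid.UUID)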


DatasetId = uuid.UUID
"""A type-annotation alias for dataset ID providing typing flexibility.
"""


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    run : `str`
        The name of the run this dataset was associated with when it was
        created.
    id : `DatasetId`, optional
        The unique identifier assigned when the dataset is created. If ``id``
        is not specified, a new unique ID will be created.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.
    id_generation_mode : `DatasetIdGenEnum`
        ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
        UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
        deterministic UUID5-type ID based on a dataset type name and
        ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
        deterministic UUID5-type ID based on a dataset type name, run
        collection name, and ``dataId``.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.

    See Also
    --------
    :ref:`daf_butler_organizing_datasets`
    """

    _serializedType = SerializedDatasetRef
    __slots__ = (
        "id",
        "datasetType",
        "dataId",
        "run",
    )

    def __init__(
        self,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        run: str,
        *,
        id: Optional[DatasetId] = None,
        conform: bool = True,
        id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ):
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        self.run = run
        if id is not None:
            self.id = id
        else:
            self.id = DatasetIdFactory().makeDatasetId(
                self.run, self.datasetType, self.dataId, id_generation_mode
            )
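
    # Illustrative construction sketch (not part of the original module).
    # ``my_type`` and ``my_data_id`` stand in for a real `DatasetType` and a
    # matching data ID; with no explicit ``id`` a random UUID4 is generated:
    #
    #     ref = DatasetRef(my_type, my_data_id, run="run/a")
    #     assert ref.id.version == 4
    #     same = DatasetRef(my_type, my_data_id, run="run/a", id=ref.id)
    #     assert same == ref         # equality uses (datasetType, dataId, id)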

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below
        # because DataCoordinate's __repr__ - while adhering to the
        # guidelines for __repr__ - is much harder for users to read, while
        # its __str__ just produces a dict that can also be passed to
        # DatasetRef's constructor.
        return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, run={self.run!r}, id={self.id})"

    def __str__(self) -> str:
        s = (
            f"{self.datasetType.name}@{self.dataId!s} [sc={self.datasetType.storageClass_name}]"
            f" (run={self.run} id={self.id})"
        )
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, DatasetType name, and then by DataCoordinate.
        # The __str__ representation is probably close enough, but we
        # need to ensure that sorting a DatasetRef matches what you would
        # get if you sorted DatasetType+DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; it takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in the priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)
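
    # Illustrative sorting sketch (not part of the original module): given
    # refs ``r1`` in run "a" and ``r2`` in run "b" with identical dataset
    # types and data IDs, the run name takes precedence:
    #
    #     assert sorted([r2, r1]) == [r1, r2]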

    def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetRef`
            The object converted to simplified form.
        """
        if minimal:
            # The only thing needed to uniquely define a DatasetRef is its id
            # so that can be used directly if it is not a component DatasetRef.
            # Store it in a dict to allow us to easily add the planned origin
            # information later without having to support an int and dict in
            # simple form.
            simple: Dict[str, Any] = {"id": self.id}
            if self.isComponent():
                # We can still be a little minimalist with a component
                # but we will also need to record the datasetType component.
                simple["component"] = self.datasetType.component()
            return SerializedDatasetRef(**simple)

        return SerializedDatasetRef(
            datasetType=self.datasetType.to_simple(minimal=minimal),
            dataId=self.dataId.to_simple(),
            run=self.run,
            id=self.id,
        )
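
    # Illustrative serialization sketch (not part of the original module).
    # Minimal form keeps only the ID (plus component, if any); full form
    # also records the dataset type, data ID, and run:
    #
    #     full = ref.to_simple()
    #     assert full.run == ref.run and full.id == ref.id
    #     minimal = ref.to_simple(minimal=True)
    #     assert minimal.datasetType is None   # registry needed to restore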

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatasetRef,
        universe: Optional[DimensionUniverse] = None,
        registry: Optional[Registry] = None,
        datasetType: Optional[DatasetType] = None,
    ) -> DatasetRef:
        """Construct a new object from simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetRef`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple form of a DatasetRef to
            a full `DatasetRef`. Can be `None` if a full description of
            the type is provided along with a universe.
        datasetType : `DatasetType`, optional
            If datasetType is supplied, this will be used as the datasetType
            object in the resulting DatasetRef instead of being read from
            the `SerializedDatasetRef`. This is useful when many refs share
            the same type, as memory can be saved. Defaults to `None`.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
        """
        # Minimalist component will just specify component and id and
        # require registry to reconstruct.
        if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}):
            if registry is None:
                raise ValueError("Registry is required to construct component DatasetRef from integer id")
            if simple.id is None:
                raise ValueError("For minimal DatasetRef the ID must be defined.")
            ref = registry.getDataset(simple.id)
            if ref is None:
                raise RuntimeError(f"No matching dataset found in registry for id {simple.id}")
            if simple.component:
                ref = ref.makeComponentRef(simple.component)
            return ref

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            universe = registry.dimensions

        if universe is None:
            # This is for mypy.
            raise ValueError("Unable to determine a usable universe")

        if simple.datasetType is None and datasetType is None:
            # mypy
            raise ValueError("The DatasetType must be specified to construct a DatasetRef")
        if datasetType is None:
            if simple.datasetType is None:
                raise ValueError("Cannot determine Dataset type of this serialized class")
            datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)

        if simple.dataId is None:
            # mypy
            raise ValueError("The DataId must be specified to construct a DatasetRef")
        dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)

        # Check that the simple ref is resolved.
        if simple.run is None:
            dstr = ""
            if simple.datasetType is None:
                dstr = f" (datasetType={datasetType.name!r})"
            raise ValueError(
                "Run collection name is missing from serialized representation. "
                f"Encountered with {simple!r}{dstr}."
            )

        return cls(datasetType, dataId, id=simple.id, run=simple.run)
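
    # Illustrative round-trip sketch (not part of the original module).
    # ``universe`` stands in for a real `DimensionUniverse`; a full
    # (non-minimal) simple form can be restored without a registry:
    #
    #     restored = DatasetRef.from_simple(ref.to_simple(), universe=universe)
    #     assert restored == ref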

    to_json = to_json_pydantic
    from_json: ClassVar = classmethod(from_json_pydantic)

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: DatasetId,
        run: str,
    ) -> DatasetRef:
        """Create new `DatasetRef`.

        A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(
            datasetType=self.datasetType, dataId=dataId, id=self.id, run=self.run, conform=False
        )

    def isComponent(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        if "instrument" in self.dataId:
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) for n in names) + names

        return names
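
    # Illustrative sketch (not part of the original module): for a ref whose
    # data ID contains an instrument, instrument-specific variants are
    # prepended to the generic keys, so config lookups prefer them:
    #
    #     names = ref._lookupNames()
    #     assert len(names) == 2 * len(ref.datasetType._lookupNames())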

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result
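
    # Illustrative usage sketch (not part of the original module), assuming
    # ``refs`` is an iterable of resolved DatasetRef instances:
    #
    #     for dataset_type, refs_of_type in DatasetRef.groupByType(refs).items():
    #         print(dataset_type.name, len(refs_of_type))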

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.

        Requires that this `DatasetRef` is a component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the
            composite parent of this component, and the same ID and run.
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(
            self.datasetType.makeCompositeDatasetType(), self.dataId, id=self.id, run=self.run, conform=False
        )

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run.
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(
            self.datasetType.makeComponentDatasetType(name),
            self.dataId,
            id=self.id,
            run=self.run,
            conform=False,
        )
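
    # Illustrative sketch (not part of the original module): component refs
    # share the parent's ID, run, and data ID; only the dataset type changes.
    # "wcs" is a hypothetical component name:
    #
    #     wcs_ref = ref.makeComponentRef("wcs")
    #     assert wcs_ref.isComponent()
    #     assert wcs_ref.id == ref.id and wcs_ref.run == ref.run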

    def overrideStorageClass(self, storageClass: str | StorageClass) -> DatasetRef:
        """Create a new `DatasetRef` from this one, but with a modified
        `DatasetType` that has a different `StorageClass`.

        Parameters
        ----------
        storageClass : `str` or `StorageClass`
            The new storage class.

        Returns
        -------
        modified : `DatasetRef`
            A new dataset reference that is the same as the current one but
            with a different storage class in the `DatasetType`.
        """
        return DatasetRef(
            datasetType=self.datasetType.overrideStorageClass(storageClass),
            dataId=self.dataId,
            id=self.id,
            run=self.run,
            conform=False,
        )

    def is_compatible_with(self, ref: DatasetRef) -> bool:
        """Determine if the given `DatasetRef` is compatible with this one.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset ref to check.

        Returns
        -------
        is_compatible : `bool`
            Returns `True` if the other dataset ref is either the same as
            this one, or the dataset type associated with the other is
            compatible with this one and the dataId and dataset ID match.

        Notes
        -----
        Compatibility requires that the dataId and dataset ID match and the
        `DatasetType` is compatible. Compatibility is defined as the storage
        class associated with the dataset type of the other ref being
        convertible to this storage class.

        Specifically this means that if you have done:

        .. code-block:: py

            new_ref = ref.overrideStorageClass(sc)

        and this is successful, then the guarantee is that:

        .. code-block:: py

            assert ref.is_compatible_with(new_ref) is True

        since we know that the python type associated with the new ref can
        be converted to the original python type. The reverse is not
        guaranteed and depends on whether bidirectional converters have been
        registered.
        """
        if self.id != ref.id:
            return False
        if self.dataId != ref.dataId:
            return False
        if self.run != ref.run:
            return False
        return self.datasetType.is_compatible_with(ref.datasetType)

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: str
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed.
    """

    id: DatasetId
    """Primary key of the dataset (`DatasetId`).

    Cannot be changed after a `DatasetRef` is constructed.
    """