Coverage for python/lsst/daf/butler/core/datasets/ref.py: 31%
183 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetId", "DatasetRef", "SerializedDatasetRef"]

import uuid
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union

from lsst.utils.classes import immutable
from pydantic import BaseModel, ConstrainedInt, StrictStr, validator

from ..configSupport import LookupKey
from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse, SerializedDataCoordinate
from ..json import from_json_pydantic, to_json_pydantic
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType

if TYPE_CHECKING:
    from ...registry import Registry
    from ..storageClass import StorageClass


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class PositiveInt(ConstrainedInt):
    ge = 0
    strict = True


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    # DO NOT change order in the Union, pydantic is sensitive to that!
    id: Optional[Union[uuid.UUID, PositiveInt]] = None
    datasetType: Optional[SerializedDatasetType] = None
    dataId: Optional[SerializedDataCoordinate] = None
    run: Optional[StrictStr] = None
    component: Optional[StrictStr] = None

    @validator("dataId")
    def _check_dataId(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Can not specify 'dataId' without specifying 'datasetType'")
        return v

    @validator("run")
    def _check_run(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def _check_component(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).")
        return v
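
    # Illustrative sketch (not part of the original source): the validators
    # above reject inconsistent payloads at construction time. The field
    # values shown here are hypothetical.
    #
    #     SerializedDatasetRef(run="some_run")                    # ValueError: 'run' needs 'id'
    #     SerializedDatasetRef(id=uuid.uuid4(), run="some_run")   # accepted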

    @classmethod
    def direct(
        cls,
        *,
        id: Optional[Union[str, int]] = None,
        datasetType: Optional[Dict[str, Any]] = None,
        dataId: Optional[Dict[str, Any]] = None,
        run: str | None = None,
        component: Optional[str] = None,
    ) -> SerializedDatasetRef:
        """Construct a `SerializedDatasetRef` directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        node = SerializedDatasetRef.__new__(cls)
        setter = object.__setattr__
        setter(node, "id", uuid.UUID(id) if isinstance(id, str) else id)
        setter(
            node,
            "datasetType",
            datasetType if datasetType is None else SerializedDatasetType.direct(**datasetType),
        )
        setter(node, "dataId", dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
        setter(node, "run", run)
        setter(node, "component", component)
        setter(node, "__fields_set__", {"id", "datasetType", "dataId", "run", "component"})
        return node
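
    # Illustrative sketch (not part of the original source): `direct` skips
    # validation, so it is suited to rebuilding values we serialized
    # ourselves. The UUID string and run name below are hypothetical.
    #
    #     node = SerializedDatasetRef.direct(
    #         id="6f7fbbb2-63b6-4f59-9519-32d42251a2e4",
    #         run="some_run",
    #     )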


DatasetId = Union[int, uuid.UUID]
"""A type-annotation alias for a dataset ID, which may be either an integer
or a UUID.
"""


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `DatasetId`, optional
        The unique identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created. Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.

    See Also
    --------
    :ref:`daf_butler_organizing_datasets`
    """

    _serializedType = SerializedDatasetRef
    __slots__ = (
        "id",
        "datasetType",
        "dataId",
        "run",
    )

    def __init__(
        self,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        *,
        id: Optional[DatasetId] = None,
        run: Optional[str] = None,
        conform: bool = True,
    ):
        self.id = id
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            if run is None:
                raise ValueError(
                    f"Cannot provide id without run for dataset with id={id}, "
                    f"type={datasetType}, and dataId={dataId}."
                )
            self.run = run
        else:
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
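
    # Illustrative sketch (not part of the original source): constructing
    # refs. ``flat_type`` and ``data_id`` are hypothetical, assumed to be a
    # `DatasetType` and a matching `DataCoordinate`.
    #
    #     unresolved_ref = DatasetRef(flat_type, data_id)
    #     resolved_ref = DatasetRef(flat_type, data_id,
    #                               id=uuid.uuid4(), run="some_run")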

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below because
        # DataCoordinate's __repr__ - while adhering to the guidelines for
        # __repr__ - is much harder for users to read, while its __str__ just
        # produces a dict that can also be passed to DatasetRef's constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s} [sc={self.datasetType.storageClass_name}]"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, DatasetType name and then by DataCoordinate.
        # The __str__ representation is probably close enough but we
        # need to ensure that sorting a DatasetRef matches what you would
        # get if you sorted DatasetType + DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; it takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in the priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)
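
    # Illustrative sketch (not part of the original source): because
    # unresolved refs map to an empty run string, they sort ahead of resolved
    # refs from any named run. ``refs`` is a hypothetical list of DatasetRef.
    #
    #     for ref in sorted(refs):
    #         print(ref)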

    def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetRef`
            The object converted to its serialized form.
        """
        if minimal and self.id is not None:
            # The only thing needed to uniquely define a DatasetRef
            # is its id so that can be used directly if it is
            # resolved and if it is not a component DatasetRef.
            # Store it in a dict to allow us to easily add the planned
            # origin information later without having to support
            # an int and dict in simple form.
            simple: Dict[str, Any] = {"id": self.id}
            if self.isComponent():
                # We can still be a little minimalist with a component
                # but we will also need to record the datasetType component.
                simple["component"] = self.datasetType.component()
            return SerializedDatasetRef(**simple)

        # Convert to a dict form.
        as_dict: Dict[str, Any] = {
            "datasetType": self.datasetType.to_simple(minimal=minimal),
            "dataId": self.dataId.to_simple(),
        }

        # Only include the id entry if it is defined.
        if self.id is not None:
            as_dict["run"] = self.run
            as_dict["id"] = self.id

        return SerializedDatasetRef(**as_dict)
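
    # Illustrative sketch (not part of the original source): a serialization
    # round trip, assuming ``ref`` is a resolved DatasetRef and ``universe``
    # is the repository's DimensionUniverse.
    #
    #     simple = ref.to_simple()
    #     restored = DatasetRef.from_simple(simple, universe=universe)
    #     assert restored == ref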

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDatasetRef,
        universe: Optional[DimensionUniverse] = None,
        registry: Optional[Registry] = None,
        datasetType: Optional[DatasetType] = None,
    ) -> DatasetRef:
        """Construct a new object from simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetRef`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple form of a DatasetRef to
            a full `DatasetRef`. Can be `None` if a full description of
            the type is provided along with a universe.
        datasetType : `DatasetType`, optional
            If supplied, use this as the dataset type of the resulting
            `DatasetRef` instead of reading it from the
            `SerializedDatasetRef`. This is useful when many refs share the
            same type, as memory can be saved. Defaults to `None`.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
        """
        # Minimalist component will just specify component and id and
        # require registry to reconstruct.
        if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}):
            if registry is None:
                raise ValueError("Registry is required to construct component DatasetRef from integer id")
            if simple.id is None:
                raise ValueError("For minimal DatasetRef the ID must be defined.")
            ref = registry.getDataset(simple.id)
            if ref is None:
                raise RuntimeError(f"No matching dataset found in registry for id {simple.id}")
            if simple.component:
                ref = ref.makeComponentRef(simple.component)
            return ref

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            universe = registry.dimensions

        if universe is None:
            # this is for mypy
            raise ValueError("Unable to determine a usable universe")

        if simple.datasetType is None and datasetType is None:
            # mypy
            raise ValueError("The DatasetType must be specified to construct a DatasetRef")
        if datasetType is None:
            if simple.datasetType is None:
                raise ValueError("Cannot determine Dataset type of this serialized class")
            datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)

        if simple.dataId is None:
            # mypy
            raise ValueError("The DataId must be specified to construct a DatasetRef")
        dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)
        return cls(datasetType, dataId, id=simple.id, run=simple.run)

    to_json = to_json_pydantic
    from_json = classmethod(from_json_pydantic)
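
    # Illustrative sketch (not part of the original source): the
    # pydantic-backed JSON helpers mirror to_simple/from_simple; ``ref`` and
    # ``universe`` are assumed as above.
    #
    #     json_str = ref.to_json()
    #     restored = DatasetRef.from_json(json_str, universe=universe)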

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: Optional[DatasetId],
        run: Optional[str],
    ) -> DatasetRef:
        """Create new `DatasetRef`.

        A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def resolved(self, id: DatasetId, run: str) -> DatasetRef:
        """Return resolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `DatasetId`
            The unique identifier assigned when the dataset is created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, id=id, run=run, conform=False)
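
    # Illustrative sketch (not part of the original source): resolving an
    # unresolved ref once an ID and run are known; ``unresolved_ref`` and the
    # run name are hypothetical.
    #
    #     resolved_ref = unresolved_ref.resolved(id=uuid.uuid4(), run="some_run")
    #     assert resolved_ref.unresolved() == unresolved_ref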

    def unresolved(self) -> DatasetRef:
        """Return unresolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type,
        but no ID or run.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False)

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(
            datasetType=self.datasetType, dataId=dataId, id=self.id, run=self.run, conform=False
        )

    def isComponent(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Indicate whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded to
        # assume it will return False if the type doesn't match the key type
        # of the Mapping.
        if "instrument" in self.dataId:
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) for n in names) + names

        return names

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result
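
    # Illustrative sketch (not part of the original source): iterating over
    # the grouped result; ``refs`` is a hypothetical iterable of DatasetRef.
    #
    #     for datasetType, refs_of_type in DatasetRef.groupByType(refs).items():
    #         print(datasetType.name, len(refs_of_type))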

    def getCheckedId(self) -> DatasetId:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise be
        natural list comprehensions to check that the ID is not `None` as well.

        Returns
        -------
        id : `DatasetId`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; a resolved reference is required.")
        return self.id
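
    # Illustrative sketch (not part of the original source): the docstring's
    # list-comprehension use case, raising AmbiguousDatasetError if any ref
    # in the hypothetical ``refs`` is unresolved.
    #
    #     ids = [ref.getCheckedId() for ref in refs]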

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.

        Requires that this `DatasetRef` is a component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the
            composite parent of this component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(
            self.datasetType.makeCompositeDatasetType(), self.dataId, id=self.id, run=self.run, conform=False
        )

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(
            self.datasetType.makeComponentDatasetType(name),
            self.dataId,
            id=self.id,
            run=self.run,
            conform=False,
        )
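
    # Illustrative sketch (not part of the original source): component refs
    # round-trip back to their composite; "wcs" is a hypothetical component
    # name and ``ref`` a hypothetical composite ref.
    #
    #     wcs_ref = ref.makeComponentRef("wcs")
    #     assert wcs_ref.isComponent()
    #     assert wcs_ref.makeCompositeRef() == ref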

    def overrideStorageClass(self, storageClass: str | StorageClass) -> DatasetRef:
        """Create a new `DatasetRef` from this one, but with a modified
        `DatasetType` that has a different `StorageClass`.

        Parameters
        ----------
        storageClass : `str` or `StorageClass`
            The new storage class.

        Returns
        -------
        modified : `DatasetRef`
            A new dataset reference that is the same as the current one but
            with a different storage class in the `DatasetType`.
        """
        return DatasetRef(
            datasetType=self.datasetType.overrideStorageClass(storageClass),
            dataId=self.dataId,
            id=self.id,
            run=self.run,
            conform=False,
        )
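
    # Illustrative sketch (not part of the original source): requesting the
    # same dataset with a different in-memory representation; the storage
    # class name here is hypothetical.
    #
    #     table_ref = ref.overrideStorageClass("SomeOtherStorageClass")
    #     assert table_ref.dataId == ref.dataId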

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[DatasetId]
    """Primary key of the dataset (`DatasetId` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """