Coverage for python/lsst/daf/butler/core/datasets/ref.py : 31%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations
__all__ = ["AmbiguousDatasetError", "DatasetRef", "SerializedDatasetRef"]

from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
)

from pydantic import BaseModel, StrictStr, ConstrainedInt, validator

from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse
from ..configSupport import LookupKey
from ..utils import immutable
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType
from ..json import from_json_pydantic, to_json_pydantic

if TYPE_CHECKING:
    from ...registry import Registry


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class PositiveInt(ConstrainedInt):
    ge = 0
    strict = True


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    id: Optional[PositiveInt] = None
    datasetType: Optional[SerializedDatasetType] = None
    dataId: Optional[Dict[str, Any]] = None  # Do not use specialist pydantic model for this
    run: Optional[StrictStr] = None
    component: Optional[StrictStr] = None

    @validator("dataId")
    def check_dataId(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Can not specify 'dataId' without specifying 'datasetType'")
        return v

    @validator("run")
    def check_run(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def check_component(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).")
        return v
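
# A minimal, illustrative sketch of the constraints the validators above
# enforce, using hypothetical values:
#
#     SerializedDatasetRef(id=42, run="ingest/run-1")   # accepted
#     SerializedDatasetRef(run="ingest/run-1")          # rejected: 'run' requires 'id'
#     SerializedDatasetRef(dataId={"visit": 1})         # rejected: 'dataId' requires 'datasetType'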


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created. Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.
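
    Examples
    --------
    An illustrative sketch of typical construction; the dataset type,
    dimensions, and data ID values here are hypothetical and would normally
    come from a `Registry`::

        datasetType = DatasetType("calexp", dimensions, storageClass)
        dataId = {"instrument": "HSC", "visit": 903334, "detector": 20}
        ref = DatasetRef(datasetType, dataId)                     # unresolved
        ref = DatasetRef(datasetType, dataId, id=42, run="run1")  # resolved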
121 """
123 _serializedType = SerializedDatasetRef
124 __slots__ = ("id", "datasetType", "dataId", "run",)
126 def __init__(
127 self,
128 datasetType: DatasetType, dataId: DataCoordinate, *,
129 id: Optional[int] = None,
130 run: Optional[str] = None,
131 conform: bool = True
132 ):
133 self.id = id
134 self.datasetType = datasetType
135 if conform:
136 self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
137 else:
138 self.dataId = dataId
139 if self.id is not None:
140 if run is None:
141 raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
142 f"type={datasetType}, and dataId={dataId}.")
143 self.run = run
144 else:
145 if run is not None:
146 raise ValueError("'run' cannot be provided unless 'id' is.")
147 self.run = None

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below
        # because DataCoordinate's __repr__ - while adhering to the
        # guidelines for __repr__ - is much harder for users to read, while
        # its __str__ just produces a dict that can also be passed to
        # DatasetRef's constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s} [sc={self.datasetType.storageClass.name}]"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, then DatasetType name, then DataCoordinate.
        # The __str__ representation is probably close enough but we
        # need to ensure that sorting a DatasetRef matches what you would
        # get if you sorted DatasetType + DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; run takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)
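
    # An illustrative consequence of the ordering above, with hypothetical
    # refs: an unresolved ref (run=None, which sorts as "") orders before
    # any resolved ref with a non-empty run name:
    #
    #     sorted([resolved_ref, unresolved_ref])
    #     # -> [unresolved_ref, resolved_ref]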

    def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetRef`
            The object converted to simplified form.
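
        Examples
        --------
        An illustrative round trip through simplified form (``ref`` is an
        existing `DatasetRef` and ``universe`` its `DimensionUniverse`)::

            simple = ref.to_simple()
            restored = DatasetRef.from_simple(simple, universe=universe)
            assert restored == ref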
209 """
210 if minimal and self.id is not None:
211 # The only thing needed to uniquely define a DatasetRef
212 # is the integer id so that can be used directly if it is
213 # resolved and if it is not a component DatasetRef.
214 # Store is in a dict to allow us to easily add the planned
215 # origin information later without having to support
216 # an int and dict in simple form.
217 simple: Dict[str, Any] = {"id": self.id}
218 if self.isComponent():
219 # We can still be a little minimalist with a component
220 # but we will also need to record the datasetType component
221 simple["component"] = self.datasetType.component()
222 return SerializedDatasetRef(**simple)
224 # Convert to a dict form
225 as_dict: Dict[str, Any] = {"datasetType": self.datasetType.to_simple(minimal=minimal),
226 "dataId": self.dataId.to_simple(),
227 }
229 # Only include the id entry if it is defined
230 if self.id is not None:
231 as_dict["run"] = self.run
232 as_dict["id"] = self.id
234 return SerializedDatasetRef(**as_dict)

    @classmethod
    def from_simple(cls, simple: SerializedDatasetRef,
                    universe: Optional[DimensionUniverse] = None,
                    registry: Optional[Registry] = None) -> DatasetRef:
        """Construct a new object from simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetRef`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert simple form of a DatasetRef to
            a full `DatasetRef`. Can be `None` if a full description of
            the type is provided along with a universe.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
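
        Examples
        --------
        An illustrative sketch of reconstructing the minimal (id-only) form,
        which requires a registry (``butler.registry`` here is assumed)::

            simple = ref.to_simple(minimal=True)
            restored = DatasetRef.from_simple(simple, registry=butler.registry)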
260 """
261 # Minimalist component will just specify component and id and
262 # require registry to reconstruct
263 if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}):
264 if registry is None:
265 raise ValueError("Registry is required to construct component DatasetRef from integer id")
266 if simple.id is None:
267 raise ValueError("For minimal DatasetRef the ID must be defined.")
268 ref = registry.getDataset(simple.id)
269 if ref is None:
270 raise RuntimeError(f"No matching dataset found in registry for id {simple.id}")
271 if simple.component:
272 ref = ref.makeComponentRef(simple.component)
273 return ref
275 if universe is None and registry is None:
276 raise ValueError("One of universe or registry must be provided.")
278 if universe is None and registry is not None:
279 universe = registry.dimensions
281 if universe is None:
282 # this is for mypy
283 raise ValueError("Unable to determine a usable universe")
285 if simple.datasetType is None:
286 # mypy
287 raise ValueError("The DatasetType must be specified to construct a DatasetRef")
288 datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)
290 if simple.dataId is None:
291 # mypy
292 raise ValueError("The DataId must be specified to construct a DatasetRef")
293 dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)
294 return cls(datasetType, dataId,
295 id=simple.id, run=simple.run)

    to_json = to_json_pydantic
    from_json = classmethod(from_json_pydantic)

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: Optional[int],
        run: Optional[str],
    ) -> DatasetRef:
        """Create new `DatasetRef`.

        A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def resolved(self, id: int, run: str) -> DatasetRef:
        """Return resolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
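
        Examples
        --------
        An illustrative sketch; the ID and run name here are hypothetical::

            resolvedRef = ref.resolved(id=42, run="ingest/run-1")
            assert resolvedRef.getCheckedId() == 42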
340 """
341 return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
342 id=id, run=run, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return unresolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type,
        but no ID or run.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False)

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run,
                          conform=False)

    def isComponent(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
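
        Examples
        --------
        An illustrative sketch of the priority ordering for a hypothetical
        ``calexp`` dataset (storage class ``ExposureF``) whose data ID
        contains ``instrument="HSC"``; instrument-specific keys come first::

            (LookupKey(name="calexp", dataId={"instrument": "HSC"}),
             LookupKey(name="ExposureF", dataId={"instrument": "HSC"}),
             LookupKey(name="calexp"),
             LookupKey(name="ExposureF"))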
418 """
419 # Special case the instrument Dimension since we allow configs
420 # to include the instrument name in the hierarchy.
421 names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()
423 # mypy doesn't think this could return True, because even though
424 # __contains__ can take an object of any type, it seems hard-coded to
425 # assume it will return False if the type doesn't match the key type
426 # of the Mapping.
427 if "instrument" in self.dataId: # type: ignore
428 names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
429 for n in names) + names
431 return names

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
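
        Examples
        --------
        An illustrative sketch of iterating over the grouped result
        (``refs`` is assumed to be an existing iterable of `DatasetRef`)::

            for datasetType, typedRefs in DatasetRef.groupByType(refs).items():
                print(datasetType.name, len(typedRefs))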
446 """
447 result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
448 for ref in refs:
449 result.setdefault(ref.datasetType, []).append(ref)
450 return result

    def getCheckedId(self) -> int:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise be
        natural list comprehensions to check that the ID is not `None` as
        well.

        Returns
        -------
        id : `int`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
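
        Examples
        --------
        An illustrative use in a comprehension, failing fast if any ref is
        unresolved (``refs`` is assumed to be a list of `DatasetRef`)::

            ids = [ref.getCheckedId() for ref in refs]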
467 """
468 if self.id is None:
469 raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
470 f"a resolved reference is required.")
471 return self.id

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.

        Requires that this `DatasetRef` is a component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the
            composite parent of this component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(self.datasetType.makeCompositeDatasetType(), self.dataId,
                          id=self.id, run=self.run, conform=False)

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
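
        Examples
        --------
        An illustrative sketch for a hypothetical ``wcs`` component; the
        composite can be recovered with `makeCompositeRef`::

            wcsRef = ref.makeComponentRef("wcs")
            assert wcsRef.makeCompositeRef() == ref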
504 """
505 # Assume that the data ID does not need to be standardized
506 # and should match whatever this ref already has.
507 return DatasetRef(self.datasetType.makeComponentDatasetType(name), self.dataId,
508 id=self.id, run=self.run, conform=False)

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """