# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetId", "DatasetRef", "SerializedDatasetRef"]

import uuid
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
    Union,
)

from pydantic import BaseModel, StrictStr, ConstrainedInt, validator

from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse
from ..configSupport import LookupKey
from ..utils import immutable
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType
from ..json import from_json_pydantic, to_json_pydantic

if TYPE_CHECKING:
    from ...registry import Registry


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


class PositiveInt(ConstrainedInt):
    ge = 0
    strict = True


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    # DO NOT change order in the Union, pydantic is sensitive to that!
    id: Optional[Union[uuid.UUID, PositiveInt]] = None
    datasetType: Optional[SerializedDatasetType] = None
    dataId: Optional[Dict[str, Any]] = None  # Do not use specialist pydantic model for this
    run: Optional[StrictStr] = None
    component: Optional[StrictStr] = None

    @validator("dataId")
    def check_dataId(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Can not specify 'dataId' without specifying 'datasetType'")
        return v

    @validator("run")
    def check_run(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def check_component(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).")
        return v
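
# A minimal illustrative sketch (not part of the module) of how the
# validators above interact; all field values here are invented:
#
#     SerializedDatasetRef(id=42, run="some/run")   # valid: 'run' given with 'id'
#     SerializedDatasetRef(run="some/run")          # ValueError: 'run' needs 'id'
#     SerializedDatasetRef(dataId={"a": 1})         # ValueError: needs 'datasetType'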


DatasetId = Union[int, uuid.UUID]
"""A type alias for a dataset ID, which may be either an `int` or a
`uuid.UUID`.
"""


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `DatasetId`, optional
        The unique identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created.  Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility.  New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.
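
    Notes
    -----
    A short construction sketch (``datasetType`` and ``dataId`` are assumed
    to be pre-existing `DatasetType` and `DataCoordinate` instances)::

        # Unresolved: no ID or run.
        ref = DatasetRef(datasetType, dataId)

        # Resolved: ID and run must be given together.
        ref = DatasetRef(datasetType, dataId, id=uuid.uuid4(), run="my_run")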
130 """
132 _serializedType = SerializedDatasetRef
133 __slots__ = ("id", "datasetType", "dataId", "run",)
135 def __init__(
136 self,
137 datasetType: DatasetType, dataId: DataCoordinate, *,
138 id: Optional[DatasetId] = None,
139 run: Optional[str] = None,
140 conform: bool = True
141 ):
142 self.id = id
143 self.datasetType = datasetType
144 if conform:
145 self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
146 else:
147 self.dataId = dataId
148 if self.id is not None:
149 if run is None:
150 raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
151 f"type={datasetType}, and dataId={dataId}.")
152 self.run = run
153 else:
154 if run is not None:
155 raise ValueError("'run' cannot be provided unless 'id' is.")
156 self.run = None

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below because
        # DataCoordinate's __repr__ - while adhering to the guidelines for
        # __repr__ - is much harder for users to read, while its __str__ just
        # produces a dict that can also be passed to DatasetRef's constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}, sc={self.datasetType.storageClass.name}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, then by DatasetType name, then by DataCoordinate.
        # The __str__ representation is probably close enough, but we
        # need to ensure that sorting a DatasetRef matches what you would
        # get if you sorted by DatasetType + DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; it takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)
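
    # An illustrative consequence of the ordering above (not part of the
    # API): unresolved refs (run == "") sort before refs from any named run:
    #
    #     ordered = sorted(refs)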

    def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization.  Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetRef`
            The object converted to its serialized form.
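
        Notes
        -----
        An illustrative round trip for a resolved, non-component ``ref``
        (the name is assumed)::

            simple = ref.to_simple(minimal=True)
            assert simple.id == ref.id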
218 """
219 if minimal and self.id is not None:
220 # The only thing needed to uniquely define a DatasetRef
221 # is its id so that can be used directly if it is
222 # resolved and if it is not a component DatasetRef.
223 # Store is in a dict to allow us to easily add the planned
224 # origin information later without having to support
225 # an int and dict in simple form.
226 simple: Dict[str, Any] = {"id": self.id}
227 if self.isComponent():
228 # We can still be a little minimalist with a component
229 # but we will also need to record the datasetType component
230 simple["component"] = self.datasetType.component()
231 return SerializedDatasetRef(**simple)
233 # Convert to a dict form
234 as_dict: Dict[str, Any] = {"datasetType": self.datasetType.to_simple(minimal=minimal),
235 "dataId": self.dataId.to_simple(),
236 }
238 # Only include the id entry if it is defined
239 if self.id is not None:
240 as_dict["run"] = self.run
241 as_dict["id"] = self.id
243 return SerializedDatasetRef(**as_dict)

    @classmethod
    def from_simple(cls, simple: SerializedDatasetRef,
                    universe: Optional[DimensionUniverse] = None,
                    registry: Optional[Registry] = None) -> DatasetRef:
        """Construct a new object from simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetRef`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert simple form of a DatasetRef to
            a full `DatasetRef`.  Can be `None` if a full description of
            the type is provided along with a universe.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
        """
        # A minimalist component serialization specifies only the component
        # and id, and requires a registry to reconstruct.
        if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}):
            if registry is None:
                raise ValueError("Registry is required to construct component DatasetRef from integer id")
            if simple.id is None:
                raise ValueError("For minimal DatasetRef the ID must be defined.")
            ref = registry.getDataset(simple.id)
            if ref is None:
                raise RuntimeError(f"No matching dataset found in registry for id {simple.id}")
            if simple.component:
                ref = ref.makeComponentRef(simple.component)
            return ref

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            universe = registry.dimensions

        if universe is None:
            # this is for mypy
            raise ValueError("Unable to determine a usable universe")

        if simple.datasetType is None:
            # mypy
            raise ValueError("The DatasetType must be specified to construct a DatasetRef")
        datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)

        if simple.dataId is None:
            # mypy
            raise ValueError("The DataId must be specified to construct a DatasetRef")
        dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)
        return cls(datasetType, dataId,
                   id=simple.id, run=simple.run)
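
    # An illustrative round trip through the simple form (``ref`` and
    # ``universe`` are assumed to exist; no registry is needed when the
    # serialization is not minimal):
    #
    #     simple = ref.to_simple()
    #     assert DatasetRef.from_simple(simple, universe=universe) == ref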

    to_json = to_json_pydantic
    from_json = classmethod(from_json_pydantic)

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: Optional[DatasetId],
        run: Optional[str],
    ) -> DatasetRef:
        """Create a new `DatasetRef`.

        A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def resolved(self, id: DatasetId, run: str) -> DatasetRef:
        """Return a resolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `DatasetId`
            The unique identifier assigned when the dataset is created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
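
        Notes
        -----
        For example (illustrative; the run name is invented)::

            resolved_ref = ref.resolved(id=uuid.uuid4(), run="my_run")
            assert resolved_ref.run == "my_run"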
349 """
350 return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
351 id=id, run=run, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return an unresolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type,
        but no ID or run.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False)

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`.  Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run,
                          conform=False)

    def isComponent(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Indicate whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special-case the instrument dimension, since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded to
        # assume it will return False if the type doesn't match the key type
        # of the Mapping.
        if "instrument" in self.dataId:  # type: ignore
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
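
        Notes
        -----
        An illustrative use, iterating over each group::

            for datasetType, group in DatasetRef.groupByType(refs).items():
                print(datasetType.name, len(group))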
455 """
456 result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
457 for ref in refs:
458 result.setdefault(ref.datasetType, []).append(ref)
459 return result

    def getCheckedId(self) -> DatasetId:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise be
        natural list comprehensions to check that the ID is not `None` as well.

        Returns
        -------
        id : `DatasetId`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
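
        Notes
        -----
        This enables, for example (illustrative)::

            ids = [ref.getCheckedId() for ref in refs]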
476 """
477 if self.id is None:
478 raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
479 f"a resolved reference is required.")
480 return self.id

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.

        Requires that this `DatasetRef` is a component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the
            composite parent of this component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(self.datasetType.makeCompositeDatasetType(), self.dataId,
                          id=self.id, run=self.run, conform=False)

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
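
        Notes
        -----
        For example (assuming the dataset type defines an ``image``
        component)::

            image_ref = ref.makeComponentRef("image")
            assert image_ref.isComponent()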
513 """
514 # Assume that the data ID does not need to be standardized
515 # and should match whatever this ref already has.
516 return DatasetRef(self.datasetType.makeComponentDatasetType(name), self.dataId,
517 id=self.id, run=self.run, conform=False)

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[DatasetId]
    """Primary key of the dataset (`DatasetId` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """