Coverage for python/lsst/daf/butler/core/datasets/ref.py : 28%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetRef"]

from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
)

from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse
from ..configSupport import LookupKey
from ..utils import immutable
from ..named import NamedKeyDict
from .type import DatasetType
from ..json import from_json_generic, to_json_generic

if TYPE_CHECKING:
    from ...registry import Registry


class AmbiguousDatasetError(Exception):
    """Exception raised when a `DatasetRef` is not resolved (has no ID or
    run), but the requested operation requires one of them.
    """


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created.  Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility.  New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.
    """

    __slots__ = ("id", "datasetType", "dataId", "run",)

    def __init__(
        self,
        datasetType: DatasetType, dataId: DataCoordinate, *,
        id: Optional[int] = None,
        run: Optional[str] = None,
        conform: bool = True
    ):
        self.id = id
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            if run is None:
                raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
                                 f"type={datasetType}, and dataId={dataId}.")
            self.run = run
        else:
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
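
    # Usage sketch (hypothetical; assumes ``datasetType`` and ``dataId`` have
    # already been constructed elsewhere):
    #
    #     unresolved_ref = DatasetRef(datasetType, dataId)
    #     resolved_ref = DatasetRef(datasetType, dataId, id=42, run="my_run")
    #     DatasetRef(datasetType, dataId, id=42)    # ValueError: id without run
    #     DatasetRef(datasetType, dataId, run="r")  # ValueError: run without id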

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))
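
    # Equality sketch: ``run`` does not participate in equality or hashing, so
    # (illustrative; ``dt`` and ``did`` are a hypothetical type and data ID):
    #
    #     DatasetRef(dt, did, id=1, run="a") == DatasetRef(dt, did, id=1, run="b")
    #     # -> True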

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below because
        # DataCoordinate's __repr__ - while adhering to the guidelines for
        # __repr__ - is much harder for users to read, while its __str__ just
        # produces a dict that can also be passed to DatasetRef's constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        # Note: the original string carried a stray, unmatched "]" after the
        # storage class name; it is dropped here.
        s = f"{self.datasetType.name}@{self.dataId!s}, sc={self.datasetType.storageClass.name}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, DatasetType name, and then by DataCoordinate.
        # The __str__ representation is probably close enough, but we
        # need to ensure that sorting a DatasetRef matches what you would
        # get if you sorted DatasetType + DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; run takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in the priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)
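
    # Sorting sketch (hypothetical refs): unresolved refs sort before resolved
    # ones because their missing run is treated as the empty string:
    #
    #     sorted([resolved_ref, unresolved_ref])
    #     # -> [unresolved_ref, resolved_ref] when the other fields compare
    #     #    equal and the resolved ref's run is non-empty.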

    def to_simple(self, minimal: bool = False) -> Dict:
        """Convert this class to a simple python type suitable for
        serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization.  Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `dict`
            The object converted to a dictionary.
        """
        if minimal and self.id is not None:
            # The only thing needed to uniquely define a DatasetRef is its
            # integer id, so that can be used directly if the ref is resolved
            # and is not a component DatasetRef.
            # Store it in a dict to allow us to easily add the planned
            # origin information later without having to support both an
            # int and a dict in the simple form.
            simple: Dict[str, Any] = {"id": self.id}
            if self.isComponent():
                # We can still be a little minimalist with a component,
                # but we also need to record the datasetType component.
                simple["component"] = self.datasetType.component()
            return simple

        # Convert to a dict form.
        as_dict: Dict[str, Any] = {"datasetType": self.datasetType.to_simple(minimal=minimal),
                                   "dataId": self.dataId.to_simple(),
                                   }

        # Only include the id entry if it is defined.
        if self.id is not None:
            as_dict["run"] = self.run
            as_dict["id"] = self.id

        return as_dict
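
    # Output sketch (illustrative values):
    #
    #     resolved_ref.to_simple(minimal=True)
    #     # -> {"id": 42}  (plus "component" if the ref is a component)
    #     unresolved_ref.to_simple()
    #     # -> {"datasetType": {...}, "dataId": {...}}  (no "id"/"run" keys)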

    @classmethod
    def from_simple(cls, simple: Dict,
                    universe: Optional[DimensionUniverse] = None,
                    registry: Optional[Registry] = None) -> DatasetRef:
        """Construct a new object from the data returned from the `to_simple`
        method.

        Parameters
        ----------
        simple : `dict` of [`str`, `Any`]
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple form of a DatasetRef to
            a full `DatasetRef`.  Can be `None` if a full description of
            the type is provided along with a universe.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
        """
        # A minimalist (possibly component) form specifies only the id
        # (and component) and requires a registry to reconstruct.
        if set(simple).issubset({"id", "component"}):
            if registry is None:
                raise ValueError("Registry is required to construct component DatasetRef from integer id")
            ref = registry.getDataset(simple["id"])
            if ref is None:
                raise RuntimeError(f"No matching dataset found in registry for id {simple['id']}")
            if "component" in simple:
                ref = ref.makeComponentRef(simple["component"])
            return ref

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            universe = registry.dimensions

        if universe is None:
            # This is for mypy.
            raise ValueError("Unable to determine a usable universe")

        datasetType = DatasetType.from_simple(simple["datasetType"], universe=universe, registry=registry)
        dataId = DataCoordinate.from_simple(simple["dataId"], universe=universe)
        # ``id`` and ``run`` are only present in the simple form when the
        # original ref was resolved, so use ``get`` to tolerate their absence.
        return cls(datasetType, dataId,
                   id=simple.get("id"), run=simple.get("run"))
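
    # Round-trip sketch (hypothetical ``universe``/``registry`` objects):
    #
    #     same_ref = DatasetRef.from_simple(ref.to_simple(), universe=universe)
    #     # The minimal form instead needs a registry to look the dataset up:
    #     same_ref = DatasetRef.from_simple(ref.to_simple(minimal=True),
    #                                       registry=registry)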

    to_json = to_json_generic
    from_json = classmethod(from_json_generic)

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: Optional[int],
        run: Optional[str],
    ) -> DatasetRef:
        """A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def resolved(self, id: int, run: str) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is
            created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
                          id=id, run=run, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type,
        but no ID or run.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False)

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`.  Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run,
                          conform=False)

    def isComponent(self) -> bool:
        """Boolean indicating whether this `DatasetRef` refers to a
        component of a composite.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a
        configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special-case the instrument Dimension, since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded to
        # assume it will return False if the type doesn't match the key type
        # of the Mapping.
        if "instrument" in self.dataId:  # type: ignore
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names
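
    # Priority sketch (illustrative; "HSC" is a hypothetical instrument name):
    # for a data ID containing instrument="HSC", the instrument-specific
    # clones of all keys come first, followed by the generic keys:
    #
    #     (HSC-specific DatasetType key, HSC-specific StorageClass key,
    #      DatasetType key, StorageClass key)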

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result
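
    # Grouping sketch (hypothetical refs spanning two dataset types):
    #
    #     grouped = DatasetRef.groupByType([ref_a1, ref_a2, ref_b])
    #     for datasetType, refsOfType in grouped.items():
    #         ...  # refsOfType preserves the input order within each type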

    def getCheckedId(self) -> int:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists so that operations that would otherwise be
        natural list comprehensions can also check that the ID is not `None`.

        Returns
        -------
        id : `int`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
                                        "a resolved reference is required.")
        return self.id
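
    # Comprehension sketch: collect IDs while raising on the first unresolved
    # ref (``refs`` is a hypothetical iterable of DatasetRef):
    #
    #     ids = [ref.getCheckedId() for ref in refs]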

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component of this
        dataset.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        return DatasetRef(self.datasetType.makeComponentDatasetType(name), self.dataId,
                          id=self.id, run=self.run)
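
    # Component sketch ("wcs" is an illustrative component name; valid names
    # depend on the parent dataset type's storage class):
    #
    #     wcsRef = ref.makeComponentRef("wcs")
    #     wcsRef.isComponent()  # -> True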

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """