# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetRef"]

import hashlib
from typing import (
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
)

from ..dimensions import DataCoordinate, DimensionGraph, ExpandedDataCoordinate
from ..configSupport import LookupKey
from ..utils import immutable
from ..named import NamedKeyDict
from .type import DatasetType


class AmbiguousDatasetError(Exception):
    """Exception raised when a `DatasetRef` is not resolved (has no ID or
    run), but the requested operation requires one of them.
    """


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created. Must be provided if ``id`` is.
    hash : `bytes`, optional
        A hash of the dataset type and data ID. Should only be provided if
        copying from another `DatasetRef` with the same dataset type and data
        ID.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.
    hasParentId : `bool`, optional
        If `True`, this `DatasetRef` is a component that has the ``id``
        of the composite parent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.
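
    Examples
    --------
    A minimal construction sketch; the ``flat`` dataset type, the storage
    class, and the dimension values here are purely illustrative, and
    ``universe`` is assumed to be an existing `DimensionUniverse` that
    defines ``instrument`` and ``detector``::

        datasetType = DatasetType("flat", dimensions=("instrument", "detector"),
                                  storageClass="ExposureF", universe=universe)
        ref = DatasetRef(datasetType, {"instrument": "HSC", "detector": 50})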
86 """
88 __slots__ = ("id", "datasetType", "dataId", "run", "_hash", "hasParentId")
90 def __new__(cls, datasetType: DatasetType, dataId: DataCoordinate, *,
91 id: Optional[int] = None,
92 run: Optional[str] = None, hash: Optional[bytes] = None,
93 hasParentId: bool = False,
94 conform: bool = True) -> DatasetRef:
95 self = super().__new__(cls)
96 assert isinstance(datasetType, DatasetType)
97 self.id = id
98 self.datasetType = datasetType
99 self.hasParentId = hasParentId
100 if conform:
101 self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
102 else:
103 self.dataId = dataId
104 if self.id is not None:
105 if run is None:
106 raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
107 f"type={datasetType}, and dataId={dataId}.")
108 self.run = run
109 else:
110 if run is not None:
111 raise ValueError("'run' cannot be provided unless 'id' is.")
112 self.run = None
113 if hash is not None:
114 # We only set self._hash if we know it; this plays nicely with
115 # the @immutable decorator, which allows an attribute to be set
116 # only one time.
117 self._hash = hash
118 return self

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def hash(self) -> bytes:
        """Secure hash of the `DatasetType` name and data ID (`bytes`).
        """
        if not hasattr(self, "_hash"):
            message = hashlib.blake2b(digest_size=32)
            message.update(self.datasetType.name.encode("utf8"))
            self.dataId.fingerprint(message.update)
            self._hash = message.digest()
        return self._hash

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions associated with the underlying `DatasetType`
        (`DimensionGraph`).
        """
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e., use "!s") for the data ID below
        # because DataCoordinate's __repr__ - while adhering to the
        # guidelines for __repr__ - is much harder for users to read, while
        # its __str__ just produces a dict that can also be passed to
        # DatasetRef's constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __getnewargs_ex__(self) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
        # Support pickling: reconstruct by calling __new__ with these
        # positional and keyword arguments.
        return ((self.datasetType, self.dataId), {"id": self.id, "run": self.run})

    def resolved(self, id: int, run: str) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is
            created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
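
        Examples
        --------
        A usage sketch; the ID and run name below are illustrative::

            resolvedRef = ref.resolved(id=42, run="HSC/defaults")
            assert resolvedRef.id == 42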
180 """
181 return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
182 id=id, run=run, hash=self.hash, conform=False)
184 def unresolved(self) -> DatasetRef:
185 """Return a new `DatasetRef` with the same data ID and dataset type,
186 but no ID or run.
188 Returns
189 -------
190 ref : `DatasetRef`
191 A new `DatasetRef`.
193 Notes
194 -----
195 This can be used to compare only the data ID and dataset type of a
196 pair of `DatasetRef` instances, regardless of whether either is
197 resolved::
199 if ref1.unresolved() == ref2.unresolved():
200 ...
201 """
202 return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, hash=self.hash, conform=False)

    def expanded(self, dataId: ExpandedDataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `ExpandedDataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
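
        Examples
        --------
        A sketch assuming a `Registry` instance named ``registry`` is
        available to expand the original data ID::

            expandedRef = ref.expanded(registry.expandDataId(ref.dataId))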
217 """
218 assert dataId == self.dataId
219 return DatasetRef(datasetType=self.datasetType, dataId=dataId,
220 id=self.id, run=self.run, hash=self.hash,
221 conform=False)

    def isComponent(self) -> bool:
        """Boolean indicating whether this `DatasetRef` refers to a
        component of a composite.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this `DatasetRef` in a
        configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
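
        Examples
        --------
        A sketch of the expected priority ordering for a hypothetical
        ``calexp`` dataset type with storage class ``ExposureF`` and
        ``instrument="HSC"`` in its data ID::

            ref._lookupNames()
            # Keys for ("calexp", instrument="HSC"), then
            # ("ExposureF", instrument="HSC"), then "calexp",
            # then "ExposureF".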
257 """
258 # Special case the instrument Dimension since we allow configs
259 # to include the instrument name in the hierarchy.
260 names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()
262 # mypy doesn't think this could return True, because even though
263 # __contains__ can take an object of any type, it seems hard-coded to
264 # assume it will return False if the type doesn't match the key type
265 # of the Mapping.
266 if "instrument" in self.dataId: # type: ignore
267 names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
268 for n in names) + names
270 return names

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
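
        Examples
        --------
        A usage sketch; ``refs`` is assumed to be an existing iterable of
        `DatasetRef` instances::

            grouped = DatasetRef.groupByType(refs)
            for datasetType, refsOfType in grouped.items():
                print(datasetType.name, len(refsOfType))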
285 """
286 result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
287 for ref in refs:
288 result.setdefault(ref.datasetType, []).append(ref)
289 return result

    def getCheckedId(self) -> int:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise
        be natural list comprehensions to also check that the ID is not
        `None`.

        Returns
        -------
        id : `int`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
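
        Examples
        --------
        A sketch of the comprehension use case this method enables;
        ``refs`` is assumed to exist::

            ids = [ref.getCheckedId() for ref in refs]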
306 """
307 if self.id is None:
308 raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
309 f"a resolved reference is required.")
310 return self.id

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component of this
        dataset.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, with ``hasParentId=True``, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
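
        Examples
        --------
        A sketch assuming ``ref`` points to a composite whose storage class
        defines a ``wcs`` component::

            wcsRef = ref.makeComponentRef("wcs")
            assert wcsRef.datasetType.isComponent()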
327 """
328 return DatasetRef(self.datasetType.makeComponentDatasetType(name), self.dataId,
329 id=self.id, run=self.run, hasParentId=True)

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """