# Coverage-report page header (artifact of the HTML coverage dump for
# python/lsst/daf/butler/core/datasets/ref.py, 23% covered) — not source code.
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetRef"]

import hashlib
from types import MappingProxyType
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Tuple,
)

from ..configSupport import LookupKey
from ..dimensions import DataCoordinate, DimensionGraph, ExpandedDataCoordinate
from ..named import NamedKeyDict
from ..utils import immutable
from .type import DatasetType
class AmbiguousDatasetError(Exception):
    """Exception raised when a `DatasetRef` is not resolved (has no ID, run,
    or components), but the requested operation requires one of them.
    """
@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that currently does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created.  Must be provided if ``id`` is.
    hash : `bytes`, optional
        A hash of the dataset type and data ID.  Should only be provided if
        copying from another `DatasetRef` with the same dataset type and data
        ID.
    components : `dict`, optional
        A dictionary mapping component name to a `DatasetRef` for that
        component.  Should not be passed unless ``id`` is also provided (i.e.
        if this is a "resolved" reference).
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility.  New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.
    hasParentId : `bool`, optional
        If `True` this `DatasetRef` is a component that has the ``id``
        of the composite parent.  This is set if the registry does not
        know about individual components but does know about the composite.

    Raises
    ------
    ValueError
        Raised if ``run`` or ``components`` is provided but ``id`` is not, or
        if a component dataset is inconsistent with the storage class, or if
        ``id`` is provided but ``run`` is not.
    """

    __slots__ = ("id", "datasetType", "dataId", "run", "_hash", "_components", "hasParentId")

    def __new__(cls, datasetType: DatasetType, dataId: DataCoordinate, *,
                id: Optional[int] = None,
                run: Optional[str] = None, hash: Optional[bytes] = None,
                components: Optional[Mapping[str, DatasetRef]] = None, conform: bool = True,
                hasParentId: bool = False) -> DatasetRef:
        self = super().__new__(cls)
        assert isinstance(datasetType, DatasetType)
        self.id = id
        self.datasetType = datasetType
        self.hasParentId = hasParentId
        if conform:
            # Standardize so the data ID's dimensions match the dataset
            # type's; see the ``conform`` parameter docs for when to skip.
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            # Resolved reference: components are allowed (and validated),
            # and a run is required.
            self._components = dict()
            if components is not None:
                self._components.update(components)
            for k, v in self._components.items():
                expectedStorageClass = self.datasetType.storageClass.components.get(k)
                if expectedStorageClass is None:
                    raise ValueError(f"{k} is not a valid component for "
                                     f"storage class {self.datasetType.storageClass.name}.")
                if not isinstance(v, DatasetRef):
                    # It's easy to accidentally pass DatasetType or
                    # StorageClass; make that error message friendly.
                    raise ValueError(f"Component {k}={v} is not a DatasetRef.")
                if v.id is None:
                    raise ValueError(f"DatasetRef components must be resolved ({k}={v} isn't).")
                if expectedStorageClass != v.datasetType.storageClass:
                    raise ValueError(f"Storage class mismatch for component {k}: "
                                     f"{v.datasetType.storageClass.name} != {expectedStorageClass.name}")
            if run is None:
                raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
                                 f"type={datasetType}, and dataId={dataId}.")
            self.run = run
        else:
            # Unresolved reference: no components, no run.
            self._components = None
            if components:
                raise ValueError("'components' cannot be provided unless 'id' is.")
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
        if hash is not None:
            # We only set self._hash if we know it; this plays nicely with
            # the @immutable decorator, which allows an attribute to be set
            # only one time.
            self._hash = hash
        return self

    def __eq__(self, other: Any) -> bool:
        # Equality is defined on (datasetType, dataId, id) only; run and
        # components do not participate (consistent with __hash__ below).
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def hash(self) -> bytes:
        """Secure hash of the `DatasetType` name and data ID (`bytes`).
        """
        if not hasattr(self, "_hash"):
            # Computed lazily and cached; the @immutable decorator permits
            # this single assignment.
            message = hashlib.blake2b(digest_size=32)
            message.update(self.datasetType.name.encode("utf8"))
            self.dataId.fingerprint(message.update)
            self._hash = message.digest()
        return self._hash

    @property
    def components(self) -> Optional[Mapping[str, DatasetRef]]:
        """Named `DatasetRef` components (`~collections.abc.Mapping` or
        `None`).

        For resolved `DatasetRef` instances, this is a read-only mapping.  For
        unresolved instances, this is always `None`.
        """
        if self._components is None:
            return None
        return MappingProxyType(self._components)

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions associated with the underlying `DatasetType`
        """
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below because
        # DataCoordinate's __repr__ - while adhering to the guidelines for
        # __repr__ - is much harder for users to read, while its __str__ just
        # produces a dict that can also be passed to DatasetRef's constructor.
        if self.id is not None:
            return (f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r}, "
                    f"components={self._components})")
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __getnewargs_ex__(self) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
        # Pickle support: reconstruct via __new__ with these args/kwargs.
        return ((self.datasetType, self.dataId),
                {"id": self.id, "run": self.run, "components": self._components})

    def resolved(self, id: int, run: str, components: Optional[Mapping[str, DatasetRef]] = None
                 ) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is created.
        run : `str`
            The run this dataset was associated with when it was created.
        components : `dict`, optional
            A dictionary mapping component name to a `DatasetRef` for that
            component.  If ``self`` is already a resolved `DatasetRef`,
            its components will be merged with this dictionary, with this
            dictionary taking precedence.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        if self._components is not None:
            newComponents = self._components.copy()
        else:
            newComponents = {}
        if components:
            newComponents.update(components)
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
                          id=id, run=run, hash=self.hash, components=newComponents, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type,
        but no ID, run, or components.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, hash=self.hash, conform=False)

    def expanded(self, dataId: ExpandedDataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `ExpandedDataCoordinate`
            Data ID for the new `DatasetRef`.  Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run, hash=self.hash, components=self.components,
                          conform=False)

    def isComponent(self) -> bool:
        """Boolean indicating whether this `DatasetRef` refers to a
        component of a composite.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded to
        # assume it will return False if the type doesn't match the key type
        # of the Mapping.
        if "instrument" in self.dataId:  # type: ignore
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names

    def allRefs(self, parents: bool = True) -> Iterator[DatasetRef]:
        """Return all the nested component `DatasetRef` and optionally the
        parent.

        Parameters
        ----------
        parents : `bool`, optional
            If `True` (default) include the given dataset in the output
            iterable.  If `False`, include only its components.  This does
            not propagate recursively - only the outermost level of parents
            is ignored if ``parents`` is `False`.

        Yields
        ------
        ref : `DatasetRef`
            Itself (only if ``parent`` is `True`) or one of its (recursive)
            children.

        Raises
        ------
        AmbiguousDatasetError
            Raised if this reference is unresolved (``components is None``).

        Notes
        -----
        If ``parents`` is `True`, components are guaranteed to be yielded
        before their parents.
        """
        if self.components is None:
            raise AmbiguousDatasetError(f"Unresolved ref {self} cannot be flattened.")
        # Nested components are always included regardless of ``parents``;
        # that flag only controls the outermost level.
        yield from DatasetRef.flatten(self.components.values(), parents=True)
        if parents:
            yield self

    @staticmethod
    def flatten(refs: Iterable[DatasetRef], *, parents: bool = True) -> Iterator[DatasetRef]:
        """Recursively transform an iterable over `DatasetRef` to include
        nested component `DatasetRef` instances.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            Input iterable to process.  Must contain only resolved `DatasetRef`
            instances (i.e. with `DatasetRef.components` not `None`).
        parents : `bool`, optional
            If `True` (default) include the given datasets in the output
            iterable.  If `False`, include only their components.  This does
            not propagate recursively - only the outermost level of parents
            is ignored if ``parents`` is `False`.

        Yields
        ------
        ref : `DatasetRef`
            Either one of the given `DatasetRef` instances (only if ``parent``
            is `True`) or one of its (recursive) children.

        Notes
        -----
        If ``parents`` is `True`, components are guaranteed to be yielded
        before their parents.
        """
        for ref in refs:
            # Idiomatic delegation; previously a manual ``for … yield`` loop.
            yield from ref.allRefs(parents)

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef], *, recursive: bool = True
                    ) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.
        recursive : `bool`, optional
            If `True` (default), also group any `DatasetRef` instances found in
            the `DatasetRef.components` dictionaries of ``refs``, recursively.
            `True` also checks that references are "resolved" (unresolved
            references never have components).

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``recursive is True``, and one or more refs has
            ``DatasetRef.components is None`` (as is always the case for
            unresolved `DatasetRef` objects).
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        # Renamed from ``iter`` to avoid shadowing the builtin.
        flatRefs = DatasetRef.flatten(refs) if recursive else refs
        for ref in flatRefs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result

    def getCheckedId(self) -> int:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise be
        natural list comprehensions to check that the ID is not `None` as well.

        Returns
        -------
        id : `int`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
                                        "a resolved reference is required.")
        return self.id

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    _components: Optional[Dict[str, DatasetRef]]