Coverage for python/lsst/daf/butler/core/datasets/ref.py: 26%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations
__all__ = ["AmbiguousDatasetError", "DatasetRef", "FakeDatasetRef"]

import hashlib
from types import MappingProxyType
from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional, Tuple

from ..configSupport import LookupKey
from ..dimensions import DataCoordinate, DimensionGraph, ExpandedDataCoordinate
from ..utils import immutable, NamedKeyDict
from .type import DatasetType


class AmbiguousDatasetError(Exception):
    """Exception raised when a `DatasetRef` is not resolved (has no ID, run,
    or components), but the requested operation requires one of them.
    """


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created.  Must be provided if ``id`` is.
    hash : `bytes`, optional
        A hash of the dataset type and data ID.  Should only be provided if
        copying from another `DatasetRef` with the same dataset type and
        data ID.
    components : `dict`, optional
        A dictionary mapping component name to a `DatasetRef` for that
        component.  Should not be passed unless ``id`` is also provided
        (i.e. if this is a "resolved" reference).
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure
        that the data ID's dimensions are consistent with the dataset
        type's.  `DatasetRef` instances for which those dimensions are not
        equal should not be created in new code, but are still supported
        for backwards compatibility.  New code should only pass `False` if
        it can guarantee that the dimensions are already consistent.
    hasParentId : `bool`, optional
        If `True`, this `DatasetRef` is a component that carries the ``id``
        of its composite parent.  This is set when the registry does not
        know about individual components but does know about the composite.

    Raises
    ------
    ValueError
        Raised if ``run`` or ``components`` is provided but ``id`` is not,
        or if a component dataset is inconsistent with its storage class,
        or if ``id`` is provided but ``run`` is not.
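
    Notes
    -----
    An unresolved reference is typically constructed from a dataset type
    and a data ID and later resolved by a `Registry`.  As an illustrative
    sketch only (``datasetType`` and ``dataId`` are assumed to be
    preexisting `DatasetType` and `DataCoordinate` instances; the ``id``
    and ``run`` values are hypothetical)::

        ref = DatasetRef(datasetType, dataId)
        assert ref.id is None          # unresolved: no ID, run, components
        resolvedRef = ref.resolved(id=42, run="ingest/run1")
        assert resolvedRef.id == 42    # resolved: carries ID and run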
85 """

    __slots__ = ("id", "datasetType", "dataId", "run", "_hash", "_components", "hasParentId")

    def __new__(cls, datasetType: DatasetType, dataId: DataCoordinate, *,
                id: Optional[int] = None,
                run: Optional[str] = None, hash: Optional[bytes] = None,
                components: Optional[Mapping[str, DatasetRef]] = None, conform: bool = True,
                hasParentId: bool = False) -> DatasetRef:
        self = super().__new__(cls)
        assert isinstance(datasetType, DatasetType)
        self.id = id
        self.datasetType = datasetType
        self.hasParentId = hasParentId
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            self._components = dict()
            if components is not None:
                self._components.update(components)
            for k, v in self._components.items():
                expectedStorageClass = self.datasetType.storageClass.components.get(k)
                if expectedStorageClass is None:
                    raise ValueError(f"{k} is not a valid component for "
                                     f"storage class {self.datasetType.storageClass.name}.")
                if not isinstance(v, DatasetRef):
                    # It's easy to accidentally pass DatasetType or
                    # StorageClass; make that error message friendly.
                    raise ValueError(f"Component {k}={v} is not a DatasetRef.")
                if v.id is None:
                    raise ValueError(f"DatasetRef components must be resolved ({k}={v} isn't).")
                if expectedStorageClass != v.datasetType.storageClass:
                    raise ValueError(f"Storage class mismatch for component {k}: "
                                     f"{v.datasetType.storageClass.name} != {expectedStorageClass.name}")
            if run is None:
                raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
                                 f"type={datasetType}, and dataId={dataId}.")
            self.run = run
        else:
            self._components = None
            if components:
                raise ValueError("'components' cannot be provided unless 'id' is.")
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
        if hash is not None:
            # We only set self._hash if we know it; this plays nicely with
            # the @immutable decorator, which allows an attribute to be set
            # only one time.
            self._hash = hash
        return self

    def __eq__(self, other: DatasetRef) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def hash(self) -> bytes:
        """Secure hash of the `DatasetType` name and data ID (`bytes`).
        """
        if not hasattr(self, "_hash"):
            # Compute lazily on first access and cache the result; the
            # @immutable decorator permits an attribute to be set once.
            message = hashlib.blake2b(digest_size=32)
            message.update(self.datasetType.name.encode("utf8"))
            self.dataId.fingerprint(message.update)
            self._hash = message.digest()
        return self._hash

    @property
    def components(self) -> Optional[Mapping[str, DatasetRef]]:
        """Named `DatasetRef` components (`~collections.abc.Mapping` or
        `None`).

        For resolved `DatasetRef` instances, this is a read-only mapping.
        For unresolved instances, this is always `None`.
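
        As an illustrative sketch only, assuming ``ref`` is an existing
        `DatasetRef`::

            if ref.components is None:
                ...  # unresolved; no component information available
            else:
                for name, componentRef in ref.components.items():
                    ...  # each component is itself a resolved DatasetRef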
166 """
167 if self._components is None:
168 return None
169 return MappingProxyType(self._components)

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions associated with the underlying `DatasetType`
        (`DimensionGraph`).
        """
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below
        # because DataCoordinate's __repr__ - while adhering to the
        # guidelines for __repr__ - is much harder for users to read, while
        # its __str__ just produces a dict that can also be passed to
        # DatasetRef's constructor.
        if self.id is not None:
            return (f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r}, "
                    f"components={self._components})")
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __getnewargs_ex__(self) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
        # Support pickling of this __slots__-based immutable class by
        # reconstructing instances through __new__.
        return ((self.datasetType, self.dataId),
                {"id": self.id, "run": self.run, "components": self._components})

    def resolved(self, id: int, run: str, components: Optional[Mapping[str, DatasetRef]] = None
                 ) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is
            created.
        run : `str`
            The run this dataset was associated with when it was created.
        components : `dict`, optional
            A dictionary mapping component name to a `DatasetRef` for that
            component.  If ``self`` is already a resolved `DatasetRef`, its
            components will be merged with this dictionary, with this
            dictionary taking precedence.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
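
        Notes
        -----
        A minimal sketch of typical use, assuming ``ref`` is an existing
        unresolved `DatasetRef` and the ``id`` and ``run`` values are
        hypothetical::

            resolvedRef = ref.resolved(id=42, run="ingest/run1")
            assert resolvedRef.run == "ingest/run1"
            assert resolvedRef.unresolved() == ref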
219 """
220 if self._components is not None:
221 newComponents = self._components.copy()
222 else:
223 newComponents = {}
224 if components:
225 newComponents.update(components)
226 return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
227 id=id, run=run, hash=self.hash, components=newComponents, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type,
        but no ID, run, or components.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, hash=self.hash, conform=False)

    def expanded(self, dataId: ExpandedDataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `ExpandedDataCoordinate`
            Data ID for the new `DatasetRef`.  Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run, hash=self.hash, components=self.components,
                          conform=False)

    def isComponent(self) -> bool:
        """Boolean indicating whether this `DatasetRef` refers to a
        component of a composite.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this `DatasetRef` in a
        configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special-case the instrument Dimension, since we allow configs
        # to include the instrument name in the hierarchy.
        names = self.datasetType._lookupNames()

        if "instrument" in self.dataId:
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names

    @staticmethod
    def flatten(refs: Iterable[DatasetRef], *, parents: bool = True) -> Iterator[DatasetRef]:
        """Recursively transform an iterable over `DatasetRef` to include
        nested component `DatasetRef` instances.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            Input iterable to process.  Must contain only resolved
            `DatasetRef` instances (i.e. with `DatasetRef.components` not
            `None`).
        parents : `bool`, optional
            If `True` (default), include the given datasets in the output
            iterable.  If `False`, include only their components.  This
            does not propagate recursively - only the outermost level of
            parents is ignored if ``parents`` is `False`.

        Yields
        ------
        ref : `DatasetRef`
            Either one of the given `DatasetRef` instances (only if
            ``parents`` is `True`) or one of its (recursive) children.

        Notes
        -----
        If ``parents`` is `True`, components are guaranteed to be yielded
        before their parents.
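
        As an illustrative sketch only, assuming ``compositeRef`` is a
        resolved composite `DatasetRef` with two leaf components::

            flat = list(DatasetRef.flatten([compositeRef]))
            # Components are yielded first, the parent last.
            assert flat[-1] == compositeRef
            assert len(flat) == 3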
339 """
340 for ref in refs:
341 if ref.components is None:
342 raise AmbiguousDatasetError(f"Unresolved ref {ref} passed to 'flatten'.")
343 yield from DatasetRef.flatten(ref.components.values(), parents=True)
344 if parents:
345 yield ref

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef], *, recursive: bool = True
                    ) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.
        recursive : `bool`, optional
            If `True` (default), also group any `DatasetRef` instances
            found in the `DatasetRef.components` dictionaries of ``refs``,
            recursively.  This also checks that the references are
            "resolved" (unresolved references never have components).

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``recursive`` is `True` and one or more refs has
            ``DatasetRef.components is None`` (as is always the case for
            unresolved `DatasetRef` objects).
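
        Notes
        -----
        A minimal sketch of typical use, assuming ``refs`` is an existing
        iterable of resolved `DatasetRef` instances::

            for datasetType, refsOfType in DatasetRef.groupByType(refs).items():
                print(f"{datasetType.name}: {len(refsOfType)} refs")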
373 """
374 result = NamedKeyDict()
375 iter = DatasetRef.flatten(refs) if recursive else refs
376 for ref in iter:
377 result.setdefault(ref.datasetType, []).append(ref)
378 return result

    def getCheckedId(self) -> int:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise
        be natural list comprehensions to check that the ID is not `None`
        as well.

        Returns
        -------
        id : `int`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``self.id`` is `None`.
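
        Notes
        -----
        A minimal sketch of the list-comprehension use case, assuming
        ``refs`` is an existing iterable of `DatasetRef` instances expected
        to be resolved::

            ids = [ref.getCheckedId() for ref in refs]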
395 """
396 if self.id is None:
397 raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
398 f"a resolved reference is required.")
399 return self.id

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset (`str` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """


@immutable
class FakeDatasetRef:
    """A fake `DatasetRef` that can be used internally by butler where
    only the dataset ID is available.

    Should only be used when the registry cannot be used to create a full
    `DatasetRef` from the ID.  A particular use case is during dataset
    deletion, when only the ID is available.

    Parameters
    ----------
    id : `int`
        The dataset ID.
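
    Notes
    -----
    As an illustrative sketch only::

        fakeRef = FakeDatasetRef(42)
        assert fakeRef.components == {}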
444 """
    __slots__ = ("id",)

    def __new__(cls, id: int) -> FakeDatasetRef:
        self = super().__new__(cls)
        self.id = id
        return self

    def __str__(self) -> str:
        return f"dataset_id={self.id}"

    def __repr__(self) -> str:
        return f"FakeDatasetRef({self.id})"

    def __eq__(self, other: FakeDatasetRef) -> bool:
        try:
            return self.id == other.id
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash(self.id)

    @property
    def components(self) -> Mapping[str, DatasetRef]:
        """An empty component mapping, for API compatibility with
        `DatasetRef` (`~collections.abc.Mapping`).
        """
        return {}

    @staticmethod
    def flatten(refs: Iterable[FakeDatasetRef], *, parents: bool = True) -> Iterator[DatasetRef]:
        # Delegate to DatasetRef.flatten; FakeDatasetRef.components is
        # always an empty mapping, so only the given refs are yielded.
        return DatasetRef.flatten(refs, parents=parents)