Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ["AmbiguousDatasetError", "DatasetRef"] 

24 

25import hashlib 

26from typing import ( 

27 Any, 

28 Dict, 

29 Iterable, 

30 List, 

31 Optional, 

32 Tuple, 

33) 

34 

35from ..dimensions import DataCoordinate, DimensionGraph, ExpandedDataCoordinate 

36from ..configSupport import LookupKey 

37from ..utils import immutable 

38from ..named import NamedKeyDict 

39from .type import DatasetType 

40 

41 

class AmbiguousDatasetError(Exception):
    """Raised when an operation needs a resolved `DatasetRef` — one carrying
    both an ID and a run — but the reference provided has neither.
    """

46 

47 

@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that currently does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created.  Must be provided if ``id`` is.
    hash : `bytes`, optional
        A hash of the dataset type and data ID.  Should only be provided if
        copying from another `DatasetRef` with the same dataset type and data
        ID.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility.  New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.
    hasParentId : `bool`, optional
        If `True` this `DatasetRef` is a component that has the ``id``
        of the composite parent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.
    """

    __slots__ = ("id", "datasetType", "dataId", "run", "_hash", "hasParentId")

    def __new__(cls, datasetType: DatasetType, dataId: DataCoordinate, *,
                id: Optional[int] = None,
                run: Optional[str] = None, hash: Optional[bytes] = None,
                hasParentId: bool = False,
                conform: bool = True) -> DatasetRef:
        # __new__ (not __init__) is used because @immutable permits each
        # attribute to be assigned only once; all state is set here.
        self = super().__new__(cls)
        assert isinstance(datasetType, DatasetType)
        self.id = id
        self.datasetType = datasetType
        self.hasParentId = hasParentId
        if conform:
            # Rewrite the data ID so its dimensions match the dataset type's.
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        # ``id`` and ``run`` must be provided together: a resolved reference
        # always knows the run that produced it.
        if self.id is not None:
            if run is None:
                raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
                                 f"type={datasetType}, and dataId={dataId}.")
            self.run = run
        else:
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
        if hash is not None:
            # We only set self._hash if we know it; this plays nicely with
            # the @immutable decorator, which allows an attribute to be set
            # only one time.
            self._hash = hash
        return self

    def __eq__(self, other: Any) -> bool:
        # Equality deliberately ignores ``run``: ``id`` alone identifies a
        # resolved dataset, and unresolved refs have neither id nor run.
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        # Must stay consistent with __eq__: same three-field tuple.
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def hash(self) -> bytes:
        """Secure hash of the `DatasetType` name and data ID (`bytes`).
        """
        if not hasattr(self, "_hash"):
            # Computed lazily and cached; @immutable allows this single
            # deferred assignment.
            message = hashlib.blake2b(digest_size=32)
            message.update(self.datasetType.name.encode("utf8"))
            self.dataId.fingerprint(message.update)
            self._hash = message.digest()
        return self._hash

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions associated with the underlying `DatasetType`
        (`DimensionGraph`).
        """
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e use "!s") for the data ID) below because
        # DataCoordinate's __repr__ - while adhering to the guidelines for
        # __repr__ - is much harder for users to read, while its __str__ just
        # produces a dict that can also be passed to DatasetRef's constructor.
        if self.id is not None:
            return (f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})")
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        # Compact human-readable form: "type@dataId", plus the id if resolved.
        s = f"{self.datasetType.name}@{self.dataId!s}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __getnewargs_ex__(self) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
        # Pickle support: reconstruct via __new__ with these args/kwargs.
        # NOTE(review): ``hash``, ``hasParentId``, and ``conform=False`` are
        # not round-tripped here, so unpickling re-standardizes the data ID
        # and recomputes the hash lazily — confirm this is intended.
        return ((self.datasetType, self.dataId), {"id": self.id, "run": self.run})

    def resolved(self, id: int, run: str) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        # Pass the cached hash through and skip re-standardization: the data
        # ID is already conformed.
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
                          id=id, run=run, hash=self.hash, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type,
        but no ID or run.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, hash=self.hash, conform=False)

    def expanded(self, dataId: ExpandedDataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `ExpandedDataCoordinate`
            Data ID for the new `DatasetRef`.  Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        # The expanded data ID must label the same dataset; only extra
        # dimension records are being attached.
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run, hash=self.hash,
                          conform=False)

    def isComponent(self) -> bool:
        """Boolean indicating whether this `DatasetRef` refers to a
        component of a composite.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        # Delegates entirely to the dataset type.
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        # Delegates entirely to the dataset type.
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded to
        # assume it will return False if the type doesn't match the key type
        # of the Mapping.
        if "instrument" in self.dataId:  # type: ignore
            # Instrument-specific keys take priority, so they go first.
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result

    def getCheckedId(self) -> int:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise be
        natural list comprehensions to check that the ID is not `None` as well.

        Returns
        -------
        id : `int`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
                                        f"a resolved reference is required.")
        return self.id

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component of this
        dataset.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, with ``hasParentId=True``, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # The component shares the parent's data ID, id, and run; only the
        # dataset type changes, and hasParentId records the id's provenance.
        return DatasetRef(self.datasetType.makeComponentDatasetType(name), self.dataId,
                          id=self.id, run=self.run, hasParentId=True)

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """