# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetRef"]

from typing import (
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
)

from ..dimensions import DataCoordinate, DimensionGraph
from ..configSupport import LookupKey
from ..utils import immutable
from ..named import NamedKeyDict
from .type import DatasetType


class AmbiguousDatasetError(Exception):
    """Exception raised when a `DatasetRef` is not resolved (has no ID or run),
    but the requested operation requires one of them.
    """


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created.  Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility.  New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.
    hasParentId : `bool`, optional
        If `True`, this `DatasetRef` is a component that carries the ``id``
        of its composite parent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.
    """

    __slots__ = ("id", "datasetType", "dataId", "run", "hasParentId")

    def __new__(cls, datasetType: DatasetType, dataId: DataCoordinate, *,
                id: Optional[int] = None,
                run: Optional[str] = None,
                hasParentId: bool = False,
                conform: bool = True) -> DatasetRef:
        self = super().__new__(cls)
        assert isinstance(datasetType, DatasetType)
        self.id = id
        self.datasetType = datasetType
        self.hasParentId = hasParentId
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            if run is None:
                raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
                                 f"type={datasetType}, and dataId={dataId}.")
            self.run = run
        else:
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
        return self
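
    # Illustrative usage sketch (not executed; ``datasetType`` and ``dataId``
    # are assumed to be pre-existing DatasetType and DataCoordinate objects,
    # and the id/run values are placeholders):
    #
    #     unresolved_ref = DatasetRef(datasetType, dataId)
    #     resolved_ref = DatasetRef(datasetType, dataId, id=42, run="ingest/run1")
    #     DatasetRef(datasetType, dataId, id=42)  # raises ValueError: run missing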

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions associated with the underlying `DatasetType`.
        """
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below because
        # DataCoordinate's __repr__ - while adhering to the guidelines for
        # __repr__ - is much harder for users to read, while its __str__ just
        # produces a dict that can also be passed to DatasetRef's constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}, sc={self.datasetType.storageClass.name}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, then DatasetType name, then DataCoordinate.
        # The __str__ representation is probably close enough, but we
        # need to ensure that sorting a DatasetRef matches what you would
        # get if you sorted DatasetType+DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; run takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in the priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)
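
    # Sorting sketch (illustrative only): given a list ``refs`` of DatasetRef
    # instances, sorted(refs) orders them by run name first (unresolved refs,
    # whose run is None, sort as the empty string), then by dataset type, then
    # by data ID:
    #
    #     ordered = sorted(refs)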

    def __getnewargs_ex__(self) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
        # Support pickling: provide the positional and keyword arguments that
        # pickle should pass to __new__ when reconstructing this object.
        return ((self.datasetType, self.dataId), {"id": self.id, "run": self.run})

    def resolved(self, id: int, run: str) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
                          id=id, run=run, conform=False)
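
    # Illustrative sketch (not executed): attaching registry-assigned identity
    # to an unresolved reference, assuming ``unresolved_ref`` already exists
    # and the id/run values are placeholders:
    #
    #     ref = unresolved_ref.resolved(id=42, run="ingest/run1")
    #     assert ref.getCheckedId() == 42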

    def unresolved(self) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type,
        but no ID or run.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False)

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`.  Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run,
                          conform=False)

    def isComponent(self) -> bool:
        """Boolean indicating whether this `DatasetRef` refers to a
        component of a composite.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded to
        # assume it will return False if the type doesn't match the key type
        # of the Mapping.
        if "instrument" in self.dataId:  # type: ignore
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names
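
    # Illustrative sketch (names and notation are hypothetical): for a ref
    # whose dataset type yields the keys ("calexp", "ExposureF") and whose
    # data ID contains instrument="HSC", the returned priority order is the
    # instrument-scoped clones of those keys followed by the plain keys,
    # roughly:
    #
    #     (calexp+instrument=HSC, ExposureF+instrument=HSC, calexp, ExposureF)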

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result
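
    # Usage sketch (illustrative; ``refs`` is assumed to be an iterable of
    # DatasetRef instances of possibly mixed dataset types, and ``process``
    # is a hypothetical per-type callback):
    #
    #     for datasetType, typedRefs in DatasetRef.groupByType(refs).items():
    #         process(datasetType, typedRefs)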

    def getCheckedId(self) -> int:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists so that operations that would otherwise be
        natural list comprehensions can also check that the ID is not `None`.

        Returns
        -------
        id : `int`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
                                        "a resolved reference is required.")
        return self.id
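
    # Usage sketch (illustrative; ``refs`` is assumed to be an iterable of
    # resolved DatasetRefs): collect all IDs while guarding against an
    # accidentally unresolved ref, which raises AmbiguousDatasetError instead
    # of silently yielding None:
    #
    #     ids = [ref.getCheckedId() for ref in refs]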

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component of this
        dataset.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, with ``hasParentId=True``, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        return DatasetRef(self.datasetType.makeComponentDatasetType(name), self.dataId,
                          id=self.id, run=self.run, hasParentId=True)
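
    # Illustrative sketch (the component name "wcs" is hypothetical and
    # depends on the parent dataset type's storage class):
    #
    #     wcs_ref = ref.makeComponentRef("wcs")
    #     assert wcs_ref.isComponent() and wcs_ref.id == ref.id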

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """