Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ["AmbiguousDatasetError", "DatasetRef"] 

24 

25from typing import ( 

26 Any, 

27 Iterable, 

28 List, 

29 Optional, 

30 Tuple, 

31) 

32 

33from ..dimensions import DataCoordinate, DimensionGraph 

34from ..configSupport import LookupKey 

35from ..utils import immutable 

36from ..named import NamedKeyDict 

37from .type import DatasetType 

38 

39 

class AmbiguousDatasetError(Exception):
    """Exception raised when an operation needs a resolved `DatasetRef`
    (one carrying an ID and a run), but the given reference has neither.
    """

44 

45 

@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that currently does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created.  Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility.  New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.
    hasParentId : `bool`, optional
        If `True` this `DatasetRef` is a component that has the ``id``
        of the composite parent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.
    """

    __slots__ = ("id", "datasetType", "dataId", "run", "hasParentId")

    def __init__(
        self,
        datasetType: DatasetType, dataId: DataCoordinate, *,
        id: Optional[int] = None,
        run: Optional[str] = None,
        hasParentId: bool = False,
        conform: bool = True
    ):
        self.id = id
        self.datasetType = datasetType
        self.hasParentId = hasParentId
        if conform:
            # Standardize so the data ID's dimensions match the dataset
            # type's; callers that already guarantee this pass conform=False.
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        # `id` and `run` must be provided together (a "resolved" ref) or not
        # at all (an "unresolved" ref); enforce that invariant here.
        if self.id is not None:
            if run is None:
                raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
                                 f"type={datasetType}, and dataId={dataId}.")
            self.run = run
        else:
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None

    def __eq__(self, other: Any) -> bool:
        # Equality ignores `run` and `hasParentId`; it is defined by
        # (datasetType, dataId, id), consistent with __hash__ below.
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        # Must hash exactly the attributes compared in __eq__.
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions associated with the underlying `DatasetType`
        (`DimensionGraph`).
        """
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below because
        # DataCoordinate's __repr__ - while adhering to the guidelines for
        # __repr__ - is much harder for users to read, while its __str__ just
        # produces a dict that can also be passed to DatasetRef's constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        # Bug fix: a previous version emitted a stray, unbalanced closing "]"
        # after the storage class name.
        s = f"{self.datasetType.name}@{self.dataId!s}, sc={self.datasetType.storageClass.name}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, DatasetType name and then by DataCoordinate
        # The __str__ representation is probably close enough but we
        # need to ensure that sorting a DatasetRef matches what you would
        # get if you sorted DatasetType+DataCoordinate
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined, takes precedence over DatasetType;
        # map None to "" so unresolved refs sort first without a TypeError.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in the priority order
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: Optional[int],
        run: Optional[str],
        hasParentId: bool,
    ) -> DatasetRef:
        """A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run, hasParentId=hasParentId)

    def __reduce__(self) -> tuple:
        # Pickle via _unpickle because __reduce__ cannot pass the
        # keyword-only constructor arguments directly.
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run, self.hasParentId))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def resolved(self, id: int, run: str) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        # conform=False: self.dataId was already standardized (or accepted
        # as-is) at construction, so re-standardizing would be redundant.
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
                          id=id, run=run, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type,
        but no ID or run.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False)

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`.  Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        # NOTE(review): this is an internal-consistency check only (stripped
        # under ``python -O``), not input validation.
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run,
                          conform=False)

    def isComponent(self) -> bool:
        """Boolean indicating whether this `DatasetRef` refers to a
        component of a composite.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded to
        # assume it will return False if the type doesn't match the key type
        # of the Mapping.
        if "instrument" in self.dataId:  # type: ignore
            # Instrument-specific keys take priority, so they go first.
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result

    def getCheckedId(self) -> int:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise be
        natural list comprehensions to check that the ID is not `None` as well.

        Returns
        -------
        id : `int`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
                                        "a resolved reference is required.")
        return self.id

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component of this
        dataset.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, with ``hasParentId=True``, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        return DatasetRef(self.datasetType.makeComponentDatasetType(name), self.dataId,
                          id=self.id, run=self.run, hasParentId=True)

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """