# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["DatasetRef", "FakeDatasetRef"]

import hashlib
from types import MappingProxyType
from typing import Any, Dict, Iterable, Iterator, Mapping, Optional, Tuple

from ..dimensions import DataCoordinate, DimensionGraph, ExpandedDataCoordinate
from ..configSupport import LookupKey
from ..utils import immutable
from .type import DatasetType


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created.
    hash : `bytes`, optional
        A hash of the dataset type and data ID. Should only be provided if
        copying from another `DatasetRef` with the same dataset type and data
        ID.
    components : `dict`, optional
        A dictionary mapping component name to a `DatasetRef` for that
        component. Should not be passed unless ``id`` is also provided (i.e.
        if this is a "resolved" reference).
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` or ``components`` is provided but ``id`` is not, or
        if a component dataset is inconsistent with the storage class.
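
    Notes
    -----
    A minimal construction sketch; ``flatType`` is a hypothetical registered
    `DatasetType`, and the plain-`dict` data ID relies on the default
    ``conform=True`` standardization described above::

        ref = DatasetRef(flatType, {"instrument": "HSC", "detector": 50})
        assert ref.id is None                       # still unresolved
        resolved = ref.resolved(id=42, run="run1")  # attach registry identity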

74 """ 

75 

76 __slots__ = ("id", "datasetType", "dataId", "run", "_hash", "_components") 

77 

78 def __new__(cls, datasetType: DatasetType, dataId: DataCoordinate, *, 

79 id: Optional[int] = None, 

80 run: Optional[str] = None, hash: Optional[bytes] = None, 

81 components: Optional[Mapping[str, DatasetRef]] = None, conform: bool = True) -> DatasetRef: 

82 self = super().__new__(cls) 

83 assert isinstance(datasetType, DatasetType) 

84 self.id = id 

85 self.datasetType = datasetType 

86 if conform: 

87 self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions) 

88 else: 

89 self.dataId = dataId 

90 if self.id is not None: 

91 self._components = dict() 

92 if components is not None: 

93 self._components.update(components) 

94 for k, v in self._components.items(): 

95 expectedStorageClass = self.datasetType.storageClass.components.get(k) 

96 if expectedStorageClass is None: 

97 raise ValueError(f"{k} is not a valid component for " 

98 f"storage class {self.datasetType.storageClass.name}.") 

99 if not isinstance(v, DatasetRef): 

100 # It's easy to accidentally pass DatasetType or 

101 # StorageClass; make that error message friendly. 

102 raise ValueError(f"Component {k}={v} is not a DatasetRef.") 

103 if v.id is None: 

104 raise ValueError(f"DatasetRef components must be resolved ({k}={v} isn't).") 

105 if expectedStorageClass != v.datasetType.storageClass: 

106 raise ValueError(f"Storage class mismatch for component {k}: " 

107 f"{v.datasetType.storageClass.name} != {expectedStorageClass.name}") 

108 # TODO: it would be nice to guarantee that id and run should be 

109 # either both None or not None together. We can't easily do that 

110 # yet because the Query infrastructure has a hard time obtaining 

111 # run strings, so we allow run to be `None` here, but that will 

112 # change. 

113 self.run = run 

114 else: 

115 self._components = None 

116 if components: 

117 raise ValueError("'components' cannot be provided unless 'id' is.") 

118 if run is not None: 

119 raise ValueError("'run' cannot be provided unless 'id' is.") 

120 self.run = None 

121 if hash is not None: 

122 # We only set self._hash if we know it; this plays nicely with 

123 # the @immutable decorator, which allows an attribute to be set 

124 # only one time. 

125 self._hash = hash 

126 return self 

127 

    def __eq__(self, other: DatasetRef):
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def hash(self) -> bytes:
        """Secure hash of the `DatasetType` name and data ID (`bytes`).

140 """ 

141 if not hasattr(self, "_hash"): 

142 message = hashlib.blake2b(digest_size=32) 

143 message.update(self.datasetType.name.encode("utf8")) 

144 self.dataId.fingerprint(message.update) 

145 self._hash = message.digest() 

146 return self._hash 

    @property
    def components(self) -> Optional[Mapping[str, DatasetRef]]:
        """Named `DatasetRef` components (`~collections.abc.Mapping` or
        `None`).

        For resolved `DatasetRef` instances, this is a read-only view of a
        mapping that `Registry.attachComponent()` can update in place. For
        unresolved instances, this is always `None`.
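
        The returned view is a `~types.MappingProxyType`; as a sketch,
        assuming ``ref`` is a resolved composite with a ``wcs`` component::

            wcsRef = ref.components["wcs"]   # component lookup works
            ref.components["wcs"] = wcsRef   # raises TypeError (read-only)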

156 """ 

157 if self._components is None: 

158 return None 

159 return MappingProxyType(self._components) 

160 

161 @property 

162 def dimensions(self) -> DimensionGraph: 

163 """The dimensions associated with the underlying `DatasetType` 

164 """ 

165 return self.datasetType.dimensions 

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below because
        # DataCoordinate's __repr__ - while adhering to the guidelines for
        # __repr__ - is much harder for users to read, while its __str__ just
        # produces a dict that can also be passed to DatasetRef's constructor.
        if self.id is not None:
            return (f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r}, "
                    f"components={self._components})")
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __getnewargs_ex__(self) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
        return ((self.datasetType, self.dataId),
                {"id": self.id, "run": self.run, "components": self._components})

    def resolved(self, id: int, run: str, components: Optional[Mapping[str, DatasetRef]] = None
                 ) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is
            created.
        run : `str`
            The run this dataset was associated with when it was created.
        components : `dict`, optional
            A dictionary mapping component name to a `DatasetRef` for that
            component. If ``self`` is already a resolved `DatasetRef`,
            its components will be merged with this dictionary, with this
            dictionary taking precedence.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
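
        Examples
        --------
        A sketch of the merge behavior; ``ref`` and ``newBias`` are
        hypothetical, with ``newBias`` a resolved component reference::

            ref2 = ref.resolved(id=42, run="run1",
                                components={"bias": newBias})
            # ref2 keeps ref's existing components, except that "bias"
            # now maps to newBias even if ref already had one.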

209 """ 

210 if self._components is not None: 

211 newComponents = self._components.copy() 

212 else: 

213 newComponents = {} 

214 if components: 

215 newComponents.update(components) 

216 return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, 

217 id=id, run=run, hash=self.hash, components=newComponents, conform=False) 

    def unresolved(self) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type,
        but no ID, run, or components.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, hash=self.hash, conform=False)

    def expanded(self, dataId: ExpandedDataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `ExpandedDataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run, hash=self.hash, components=self.components,
                          conform=False)

    def isComponent(self) -> bool:
        """Boolean indicating whether this `DatasetRef` refers to a
        component of a composite.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a
        configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the data ID, a copy of each of
            those names keyed by the ``instrument`` value is added to the
            front of the tuple.
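
        As a sketch, for a hypothetical dataset type ``calexp`` with
        ``instrument="HSC"`` in the data ID, the priority order is roughly::

            names = ref._lookupNames()
            # (calexp + HSC, storage class + HSC, calexp, storage class)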

292 """ 

293 # Special case the instrument Dimension since we allow configs 

294 # to include the instrument name in the hierarchy. 

295 names = self.datasetType._lookupNames() 

296 

297 if "instrument" in self.dataId: 

298 names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) 

299 for n in names) + names 

300 

301 return names 

    @staticmethod
    def flatten(refs: Iterable[DatasetRef], *, parents: bool = True) -> Iterator[DatasetRef]:
        """Recursively transform an iterable over `DatasetRef` to include
        nested component `DatasetRef` instances.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            Input iterable to process. Must contain only resolved
            `DatasetRef` instances (i.e. with `DatasetRef.components` not
            `None`).
        parents : `bool`, optional
            If `True` (default), include the given datasets in the output
            iterable. If `False`, include only their components. This does
            not propagate recursively - only the outermost level of parents
            is ignored if ``parents`` is `False`.

        Yields
        ------
        ref : `DatasetRef`
            Either one of the given `DatasetRef` instances (only if
            ``parents`` is `True`) or one of its (recursive) children.

        Notes
        -----
        If ``parents`` is `True`, components are guaranteed to be yielded
        before their parents.
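
        A sketch, assuming ``parent`` is a resolved composite with
        components ``a`` and ``b``::

            refs = list(DatasetRef.flatten([parent]))
            # yields the "a" ref, the "b" ref, then parent itself;
            # pass parents=False to omit parent from the output.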

329 """ 

330 for ref in refs: 

331 if ref.components is None: 

332 raise TypeError(f"Unresolved ref '{ref} passed to 'flatten'.") 

333 yield from DatasetRef.flatten(ref.components.values(), parents=True) 

334 if parents: 

335 yield ref 

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset (`str` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """


@immutable
class FakeDatasetRef:
    """A fake `DatasetRef` that can be used internally by butler where
    only the dataset ID is available.

    Should only be used when the registry cannot be used to create a full
    `DatasetRef` from the ID. A particular use case is during dataset
    deletion, when only the ID is available.

    Parameters
    ----------
    id : `int`
        The dataset ID.
    """
    __slots__ = ("id",)

    def __new__(cls, id: int):
        self = super().__new__(cls)
        self.id = id
        return self

    def __str__(self):
        return f"dataset_id={self.id}"

    def __repr__(self):
        return f"FakeDatasetRef({self.id})"

    def __eq__(self, other: FakeDatasetRef):
        try:
            return self.id == other.id
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash(self.id)

    @property
    def components(self):
        return {}

    @staticmethod
    def flatten(refs: Iterable[FakeDatasetRef], *, parents: bool = True) -> Iterator[DatasetRef]:
        return DatasetRef.flatten(refs, parents=parents)