# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetRef"]

import hashlib
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Tuple,
)

from types import MappingProxyType

from ..dimensions import DataCoordinate, DimensionGraph, ExpandedDataCoordinate
from ..configSupport import LookupKey
from ..utils import immutable, NamedKeyDict
from .type import DatasetType


class AmbiguousDatasetError(Exception):
    """Exception raised when a `DatasetRef` is not resolved (has no ID, run,
    or components), but the requested operation requires one of them.
    """


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist (e.g.,
    because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created.  Must be provided if ``id`` is.
    hash : `bytes`, optional
        A hash of the dataset type and data ID.  Should only be provided if
        copying from another `DatasetRef` with the same dataset type and
        data ID.
    components : `dict`, optional
        A dictionary mapping component name to a `DatasetRef` for that
        component.  Should not be passed unless ``id`` is also provided
        (i.e. if this is a "resolved" reference).
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure
        that the data ID's dimensions are consistent with the dataset
        type's.  `DatasetRef` instances for which those dimensions are not
        equal should not be created in new code, but are still supported
        for backwards compatibility.  New code should only pass `False` if
        it can guarantee that the dimensions are already consistent.
    hasParentId : `bool`, optional
        If `True`, this `DatasetRef` is a component that carries the ``id``
        of its composite parent.  This is set when the registry does not
        know about individual components but does know about the composite.

    Raises
    ------
    ValueError
        Raised if ``run`` or ``components`` is provided but ``id`` is not,
        or if a component dataset is inconsistent with the storage class,
        or if ``id`` is provided but ``run`` is not.
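
    Examples
    --------
    A minimal construction sketch.  The ``DatasetType`` arguments and the
    data ID below are hypothetical stand-ins, not values defined in this
    module::

        datasetType = DatasetType("calexp", dimensions, storageClass)
        ref = DatasetRef(datasetType, dataId)              # unresolved
        resolvedRef = ref.resolved(id=42, run="some_run")  # resolved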

    """

    __slots__ = ("id", "datasetType", "dataId", "run", "_hash", "_components", "hasParentId")

    def __new__(cls, datasetType: DatasetType, dataId: DataCoordinate, *,
                id: Optional[int] = None,
                run: Optional[str] = None, hash: Optional[bytes] = None,
                components: Optional[Mapping[str, DatasetRef]] = None, conform: bool = True,
                hasParentId: bool = False) -> DatasetRef:
        self = super().__new__(cls)
        assert isinstance(datasetType, DatasetType)
        self.id = id
        self.datasetType = datasetType
        self.hasParentId = hasParentId
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            self._components = dict()
            if components is not None:
                self._components.update(components)
            for k, v in self._components.items():
                expectedStorageClass = self.datasetType.storageClass.components.get(k)
                if expectedStorageClass is None:
                    raise ValueError(f"{k} is not a valid component for "
                                     f"storage class {self.datasetType.storageClass.name}.")
                if not isinstance(v, DatasetRef):
                    # It's easy to accidentally pass DatasetType or
                    # StorageClass; make that error message friendly.
                    raise ValueError(f"Component {k}={v} is not a DatasetRef.")
                if v.id is None:
                    raise ValueError(f"DatasetRef components must be resolved ({k}={v} isn't).")
                if expectedStorageClass != v.datasetType.storageClass:
                    raise ValueError(f"Storage class mismatch for component {k}: "
                                     f"{v.datasetType.storageClass.name} != {expectedStorageClass.name}")
            if run is None:
                raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
                                 f"type={datasetType}, and dataId={dataId}.")
            self.run = run
        else:
            self._components = None
            if components:
                raise ValueError("'components' cannot be provided unless 'id' is.")
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
        if hash is not None:
            # We only set self._hash if we know it; this plays nicely with
            # the @immutable decorator, which allows an attribute to be set
            # only one time.
            self._hash = hash
        return self

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def hash(self) -> bytes:
        """Secure hash of the `DatasetType` name and data ID (`bytes`).
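
        Examples
        --------
        The hash depends only on the dataset type name and the data ID, so
        a resolved reference (here the hypothetical ``ref``) and its
        unresolved counterpart agree::

            assert ref.unresolved().hash == ref.hash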

        """
        if not hasattr(self, "_hash"):
            message = hashlib.blake2b(digest_size=32)
            message.update(self.datasetType.name.encode("utf8"))
            self.dataId.fingerprint(message.update)
            self._hash = message.digest()
        return self._hash

    @property
    def components(self) -> Optional[Mapping[str, DatasetRef]]:
        """Named `DatasetRef` components (`~collections.abc.Mapping` or
        `None`).

        For resolved `DatasetRef` instances, this is a read-only mapping.
        For unresolved instances, this is always `None`.
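
        Examples
        --------
        A sketch of the read-only behavior, assuming ``ref`` is a resolved
        composite with a hypothetical "wcs" component::

            components = ref.components
            wcsRef = components["wcs"]     # lookups work as usual
            components["wcs"] = wcsRef     # raises TypeError (read-only)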

        """
        if self._components is None:
            return None
        return MappingProxyType(self._components)

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions associated with the underlying `DatasetType`
        (`DimensionGraph`).
        """
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e., use "!s") for the data ID below
        # because DataCoordinate's __repr__ - while adhering to the
        # guidelines for __repr__ - is much harder for users to read, while
        # its __str__ just produces a dict that can also be passed to
        # DatasetRef's constructor.
        if self.id is not None:
            return (f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r}, "
                    f"components={self._components})")
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __getnewargs_ex__(self) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
        # Support pickle: because this @immutable class only implements
        # __new__, its state must be restored via constructor arguments.
        return ((self.datasetType, self.dataId),
                {"id": self.id, "run": self.run, "components": self._components})

    def resolved(self, id: int, run: str, components: Optional[Mapping[str, DatasetRef]] = None
                 ) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is
            created.
        run : `str`
            The run this dataset was associated with when it was created.
        components : `dict`, optional
            A dictionary mapping component name to a `DatasetRef` for that
            component.  If ``self`` is already a resolved `DatasetRef`, its
            components will be merged with this dictionary, with this
            dictionary taking precedence.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
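
        Examples
        --------
        A usage sketch; ``datasetType``, ``dataId``, and the ``id`` and
        ``run`` values are hypothetical placeholders::

            ref = DatasetRef(datasetType, dataId)        # unresolved
            resolvedRef = ref.resolved(id=101, run="some_run")
            assert resolvedRef.unresolved() == ref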

        """
        if self._components is not None:
            newComponents = self._components.copy()
        else:
            newComponents = {}
        if components:
            newComponents.update(components)
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
                          id=id, run=run, hash=self.hash, components=newComponents, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type,
        but no ID, run, or components.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, hash=self.hash, conform=False)

    def expanded(self, dataId: ExpandedDataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `ExpandedDataCoordinate`
            Data ID for the new `DatasetRef`.  Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run, hash=self.hash, components=self.components,
                          conform=False)

    def isComponent(self) -> bool:
        """Boolean indicating whether this `DatasetRef` refers to a
        component of a composite.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a
        configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        if "instrument" in self.dataId:
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names

    def allRefs(self, parents: bool = True) -> Iterator[DatasetRef]:
        """Return all the nested component `DatasetRef` instances and
        optionally the parent.

        Parameters
        ----------
        parents : `bool`, optional
            If `True` (default), include the given dataset in the output
            iterable.  If `False`, include only its components.  This does
            not propagate recursively; only the outermost level of parents
            is ignored if ``parents`` is `False`.

        Yields
        ------
        ref : `DatasetRef`
            Itself (only if ``parents`` is `True`) or one of its (recursive)
            children.

        Raises
        ------
        AmbiguousDatasetError
            Raised if this reference is unresolved (`DatasetRef.components`
            is `None`).

        Notes
        -----
        If ``parents`` is `True`, components are guaranteed to be yielded
        before their parents.
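
        Examples
        --------
        A sketch assuming ``ref`` is a resolved composite reference::

            refs = list(ref.allRefs())                # components, then ref
            components = list(ref.allRefs(parents=False))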

        """
        if self.components is None:
            raise AmbiguousDatasetError(f"Unresolved ref {self} cannot be flattened.")
        yield from DatasetRef.flatten(self.components.values(), parents=True)
        if parents:
            yield self

    @staticmethod
    def flatten(refs: Iterable[DatasetRef], *, parents: bool = True) -> Iterator[DatasetRef]:
        """Recursively transform an iterable over `DatasetRef` to include
        nested component `DatasetRef` instances.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            Input iterable to process.  Must contain only resolved
            `DatasetRef` instances (i.e. with `DatasetRef.components` not
            `None`).
        parents : `bool`, optional
            If `True` (default), include the given datasets in the output
            iterable.  If `False`, include only their components.  This does
            not propagate recursively; only the outermost level of parents
            is ignored if ``parents`` is `False`.

        Yields
        ------
        ref : `DatasetRef`
            Either one of the given `DatasetRef` instances (only if
            ``parents`` is `True`) or one of its (recursive) children.

        Notes
        -----
        If ``parents`` is `True`, components are guaranteed to be yielded
        before their parents.
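
        Examples
        --------
        A sketch assuming ``parentRef`` is a resolved composite whose
        components are themselves resolved::

            for ref in DatasetRef.flatten([parentRef]):
                print(ref)   # components are printed before parentRef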

        """
        for ref in refs:
            yield from ref.allRefs(parents)

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef], *, recursive: bool = True
                    ) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.
        recursive : `bool`, optional
            If `True` (default), also group any `DatasetRef` instances found
            in the `DatasetRef.components` dictionaries of ``refs``,
            recursively.  `True` also requires that the references be
            "resolved" (unresolved references never have components).

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``recursive`` is `True` and one or more refs has
            ``DatasetRef.components is None`` (as is always the case for
            unresolved `DatasetRef` objects).
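
        Examples
        --------
        A usage sketch; ``refs`` stands in for any iterable of resolved
        `DatasetRef` instances::

            for datasetType, refsOfType in DatasetRef.groupByType(refs).items():
                print(datasetType.name, len(refsOfType))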

        """
        result = NamedKeyDict()
        toGroup = DatasetRef.flatten(refs) if recursive else refs
        for ref in toGroup:
            result.setdefault(ref.datasetType, []).append(ref)
        return result

    def getCheckedId(self) -> int:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise
        be natural list comprehensions to check that the ID is not `None`
        as well.

        Returns
        -------
        id : `int`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``self.id`` is `None`.
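
        Examples
        --------
        A sketch of the list-comprehension use case mentioned above, with
        ``refs`` as a hypothetical iterable of `DatasetRef`::

            ids = [ref.getCheckedId() for ref in refs]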

        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
                                        "a resolved reference is required.")
        return self.id

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset (`str` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """