# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetRef"]

import hashlib
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Tuple,
)

from types import MappingProxyType
from ..dimensions import DataCoordinate, DimensionGraph, ExpandedDataCoordinate
from ..configSupport import LookupKey
from ..utils import immutable
from ..named import NamedKeyDict
from .type import DatasetType

class AmbiguousDatasetError(Exception):
    """Exception raised when a `DatasetRef` is not resolved (has no ID, run,
    or components), but the requested operation requires one of them.
    """

@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created. Must be provided if ``id`` is.
    hash : `bytes`, optional
        A hash of the dataset type and data ID. Should only be provided if
        copying from another `DatasetRef` with the same dataset type and data
        ID.
    components : `dict`, optional
        A dictionary mapping component name to a `DatasetRef` for that
        component. Should not be passed unless ``id`` is also provided (i.e.
        if this is a "resolved" reference).
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.
    hasParentId : `bool`, optional
        If `True`, this `DatasetRef` is a component that carries the ``id``
        of its composite parent. This is set when the registry does not
        know about individual components but does know about the composite.

    Raises
    ------
    ValueError
        Raised if ``run`` or ``components`` is provided but ``id`` is not, or
        if a component dataset is inconsistent with the storage class, or if
        ``id`` is provided but ``run`` is not.

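    Examples
    --------
    A minimal sketch of typical construction (the dataset type, data ID
    values, and run name here are hypothetical)::

        # Unresolved: identifies a dataset by type and data ID only.
        ref = DatasetRef(datasetType, {"instrument": "DummyCam", "visit": 42})
        assert ref.id is None
        # Resolved: also carries the registry-assigned ID and its run.
        resolvedRef = ref.resolved(id=1, run="test_run")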
    """

    __slots__ = ("id", "datasetType", "dataId", "run", "_hash", "_components", "hasParentId")

    def __new__(cls, datasetType: DatasetType, dataId: DataCoordinate, *,
                id: Optional[int] = None,
                run: Optional[str] = None, hash: Optional[bytes] = None,
                components: Optional[Mapping[str, DatasetRef]] = None, conform: bool = True,
                hasParentId: bool = False) -> DatasetRef:
        self = super().__new__(cls)
        assert isinstance(datasetType, DatasetType)
        self.id = id
        self.datasetType = datasetType
        self.hasParentId = hasParentId
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            self._components = dict()
            if components is not None:
                self._components.update(components)
            for k, v in self._components.items():
                expectedStorageClass = self.datasetType.storageClass.components.get(k)
                if expectedStorageClass is None:
                    raise ValueError(f"{k} is not a valid component for "
                                     f"storage class {self.datasetType.storageClass.name}.")
                if not isinstance(v, DatasetRef):
                    # It's easy to accidentally pass DatasetType or
                    # StorageClass; make that error message friendly.
                    raise ValueError(f"Component {k}={v} is not a DatasetRef.")
                if v.id is None:
                    raise ValueError(f"DatasetRef components must be resolved ({k}={v} isn't).")
                if expectedStorageClass != v.datasetType.storageClass:
                    raise ValueError(f"Storage class mismatch for component {k}: "
                                     f"{v.datasetType.storageClass.name} != {expectedStorageClass.name}")
            if run is None:
                raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
                                 f"type={datasetType}, and dataId={dataId}.")
            self.run = run
        else:
            self._components = None
            if components:
                raise ValueError("'components' cannot be provided unless 'id' is.")
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
        if hash is not None:
            # We only set self._hash if we know it; this plays nicely with
            # the @immutable decorator, which allows an attribute to be set
            # only one time.
            self._hash = hash
        return self

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def hash(self) -> bytes:
        """Secure hash of the `DatasetType` name and data ID (`bytes`).

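        The hash depends only on the dataset type name and the data ID, so
        (as a sketch) a resolved reference and its unresolved counterpart
        agree::

            assert ref.unresolved().hash == ref.hash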
        """
        if not hasattr(self, "_hash"):
            message = hashlib.blake2b(digest_size=32)
            message.update(self.datasetType.name.encode("utf8"))
            self.dataId.fingerprint(message.update)
            self._hash = message.digest()
        return self._hash

    @property
    def components(self) -> Optional[Mapping[str, DatasetRef]]:
        """Named `DatasetRef` components (`~collections.abc.Mapping` or
        `None`).

        For resolved `DatasetRef` instances, this is a read-only mapping.
        For unresolved instances, this is always `None`.

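        For example (a sketch; ``ref`` is assumed to be resolved, with a
        component named ``"wcs"``)::

            ref.components["wcs"]        # read-only mapping access
            ref.unresolved().components  # None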
        """
        if self._components is None:
            return None
        return MappingProxyType(self._components)

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions associated with the underlying `DatasetType`
        (`DimensionGraph`).
        """
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below
        # because DataCoordinate's __repr__ - while adhering to the
        # guidelines for __repr__ - is much harder for users to read, while
        # its __str__ just produces a dict that can also be passed to
        # DatasetRef's constructor.
        if self.id is not None:
            return (f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r}, "
                    f"components={self._components})")
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __getnewargs_ex__(self) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
        # Support pickling: because this immutable class does all of its
        # initialization in __new__, instances are reconstructed from the
        # original constructor arguments rather than from instance state.
        return ((self.datasetType, self.dataId),
                {"id": self.id, "run": self.run, "components": self._components})

    def resolved(self, id: int, run: str, components: Optional[Mapping[str, DatasetRef]] = None
                 ) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is
            created.
        run : `str`
            The run this dataset was associated with when it was created.
        components : `dict`, optional
            A dictionary mapping component name to a `DatasetRef` for that
            component. If ``self`` is already a resolved `DatasetRef`,
            its components will be merged with this dictionary, with this
            dictionary taking precedence.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

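        Notes
        -----
        A sketch of typical use (the ID and run name here are hypothetical)::

            resolvedRef = ref.resolved(id=42, run="my_run")
            assert resolvedRef.unresolved() == ref.unresolved()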
        """
        if self._components is not None:
            newComponents = self._components.copy()
        else:
            newComponents = {}
        if components:
            newComponents.update(components)
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
                          id=id, run=run, hash=self.hash, components=newComponents, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type,
        but no ID, run, or components.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, hash=self.hash, conform=False)

    def expanded(self, dataId: ExpandedDataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `ExpandedDataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run, hash=self.hash, components=self.components,
                          conform=False)

    def isComponent(self) -> bool:
        """Boolean indicating whether this `DatasetRef` refers to a
        component of a composite.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a
        configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.

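        Notes
        -----
        As a sketch, for a hypothetical dataset type ``"calexp"`` with
        storage class ``"ExposureF"`` and a data ID containing
        ``instrument="DummyCam"``, the keys would be ordered roughly as::

            calexp (instrument=DummyCam)
            ExposureF (instrument=DummyCam)
            calexp
            ExposureF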
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded to
        # assume it will return False if the type doesn't match the key type
        # of the Mapping.
        if "instrument" in self.dataId:  # type: ignore
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names

    def allRefs(self, parents: bool = True) -> Iterator[DatasetRef]:
        """Return all the nested component `DatasetRef` instances and
        optionally the parent.

        Parameters
        ----------
        parents : `bool`, optional
            If `True` (default), include the given dataset in the output
            iterable. If `False`, include only its components. This does
            not propagate recursively - only the outermost level of parents
            is ignored if ``parents`` is `False`.

        Yields
        ------
        ref : `DatasetRef`
            Itself (only if ``parents`` is `True`) or one of its (recursive)
            children.

        Notes
        -----
        If ``parents`` is `True`, components are guaranteed to be yielded
        before their parents.

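        As a sketch, iterating over only the components of a resolved
        composite reference::

            for componentRef in ref.allRefs(parents=False):
                ...  # every nested ref, but not ``ref`` itself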
        """
        if self.components is None:
            raise AmbiguousDatasetError(f"Unresolved ref {self} cannot be flattened.")
        yield from DatasetRef.flatten(self.components.values(), parents=True)
        if parents:
            yield self

    @staticmethod
    def flatten(refs: Iterable[DatasetRef], *, parents: bool = True) -> Iterator[DatasetRef]:
        """Recursively transform an iterable over `DatasetRef` to include
        nested component `DatasetRef` instances.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            Input iterable to process. Must contain only resolved
            `DatasetRef` instances (i.e. with `DatasetRef.components` not
            `None`).
        parents : `bool`, optional
            If `True` (default), include the given datasets in the output
            iterable. If `False`, include only their components. This does
            not propagate recursively - only the outermost level of parents
            is ignored if ``parents`` is `False`.

        Yields
        ------
        ref : `DatasetRef`
            Either one of the given `DatasetRef` instances (only if
            ``parents`` is `True`) or one of its (recursive) children.

        Notes
        -----
        If ``parents`` is `True`, components are guaranteed to be yielded
        before their parents.

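        As a sketch, flattening several resolved references while excluding
        the outermost parents themselves::

            for ref in DatasetRef.flatten([ref1, ref2], parents=False):
                ...  # every nested ref, but not ref1 or ref2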
        """
        for ref in refs:
            yield from ref.allRefs(parents)

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef], *, recursive: bool = True
                    ) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.
        recursive : `bool`, optional
            If `True` (default), also group any `DatasetRef` instances found
            in the `DatasetRef.components` dictionaries of ``refs``,
            recursively. `True` also requires that the references be
            "resolved" (unresolved references never have components).

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``recursive`` is `True` and one or more refs has
            ``DatasetRef.components is None`` (as is always the case for
            unresolved `DatasetRef` objects).

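        Examples
        --------
        A sketch of iterating over the grouped result::

            for datasetType, refList in DatasetRef.groupByType(refs).items():
                print(datasetType.name, len(refList))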
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        toGroup = DatasetRef.flatten(refs) if recursive else refs
        for ref in toGroup:
            result.setdefault(ref.datasetType, []).append(ref)
        return result

    def getCheckedId(self) -> int:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise
        be natural list comprehensions to check that the ID is not `None` as
        well.

        Returns
        -------
        id : `int`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.

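        Notes
        -----
        As a sketch, collecting IDs in a comprehension while guarding
        against unresolved references::

            ids = [ref.getCheckedId() for ref in refs]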
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
                                        "a resolved reference is required.")
        return self.id

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset (`str` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    _components: Optional[Dict[str, DatasetRef]]