# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetRef", "FakeDatasetRef"]

import hashlib
from types import MappingProxyType
from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional, Tuple

from ..dimensions import DataCoordinate, DimensionGraph, ExpandedDataCoordinate
from ..configSupport import LookupKey
from ..utils import immutable, NamedKeyDict
from .type import DatasetType

class AmbiguousDatasetError(Exception):
    """Exception raised when a `DatasetRef` is not resolved (has no ID, run,
    or components), but the requested operation requires one of them.
    """

@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist (e.g.,
    because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created. Must be provided if ``id`` is.
    hash : `bytes`, optional
        A hash of the dataset type and data ID. Should only be provided if
        copying from another `DatasetRef` with the same dataset type and data
        ID.
    components : `dict`, optional
        A dictionary mapping component name to a `DatasetRef` for that
        component. Should not be passed unless ``id`` is also provided (i.e.
        if this is a "resolved" reference).
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility. New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` or ``components`` is provided but ``id`` is not, if
        ``id`` is provided but ``run`` is not, or if a component dataset is
        inconsistent with the storage class.
    """

    __slots__ = ("id", "datasetType", "dataId", "run", "_hash", "_components")

    def __new__(cls, datasetType: DatasetType, dataId: DataCoordinate, *,
                id: Optional[int] = None,
                run: Optional[str] = None, hash: Optional[bytes] = None,
                components: Optional[Mapping[str, DatasetRef]] = None,
                conform: bool = True) -> DatasetRef:
        self = super().__new__(cls)
        assert isinstance(datasetType, DatasetType)
        self.id = id
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            self._components = dict()
            if components is not None:
                self._components.update(components)
            for k, v in self._components.items():
                expectedStorageClass = self.datasetType.storageClass.components.get(k)
                if expectedStorageClass is None:
                    raise ValueError(f"{k} is not a valid component for "
                                     f"storage class {self.datasetType.storageClass.name}.")
                if not isinstance(v, DatasetRef):
                    # It's easy to accidentally pass DatasetType or
                    # StorageClass; make that error message friendly.
                    raise ValueError(f"Component {k}={v} is not a DatasetRef.")
                if v.id is None:
                    raise ValueError(f"DatasetRef components must be resolved ({k}={v} isn't).")
                if expectedStorageClass != v.datasetType.storageClass:
                    raise ValueError(f"Storage class mismatch for component {k}: "
                                     f"{v.datasetType.storageClass.name} != {expectedStorageClass.name}")
            if run is None:
                raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
                                 f"type={datasetType}, and dataId={dataId}.")
            self.run = run
        else:
            self._components = None
            if components:
                raise ValueError("'components' cannot be provided unless 'id' is.")
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
        if hash is not None:
            # We only set self._hash if we know it; this plays nicely with
            # the @immutable decorator, which allows an attribute to be set
            # only one time.
            self._hash = hash
        return self
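
    # Editor's illustrative sketch, not part of the original source: how the
    # constructor arguments interact. ``datasetType`` and ``dataId`` stand in
    # for real `DatasetType` and `DataCoordinate` instances; the run name is
    # hypothetical.
    #
    #     unresolvedRef = DatasetRef(datasetType, dataId)
    #     resolvedRef = DatasetRef(datasetType, dataId, id=42, run="some_run")
    #     DatasetRef(datasetType, dataId, id=42)           # ValueError: id without run
    #     DatasetRef(datasetType, dataId, run="some_run")  # ValueError: run without id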

    def __eq__(self, other: DatasetRef):
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def hash(self) -> bytes:
        """Secure hash of the `DatasetType` name and data ID (`bytes`).
        """
        if not hasattr(self, "_hash"):
            message = hashlib.blake2b(digest_size=32)
            message.update(self.datasetType.name.encode("utf8"))
            self.dataId.fingerprint(message.update)
            self._hash = message.digest()
        return self._hash
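
    # Editor's note, illustrative and not part of the original source: because
    # the hash covers only the dataset type name and the data ID, a resolved
    # ref and its unresolved counterpart share the same value:
    #
    #     assert ref.hash == ref.unresolved().hash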

    @property
    def components(self) -> Optional[Mapping[str, DatasetRef]]:
        """Named `DatasetRef` components (`~collections.abc.Mapping` or
        `None`).

        For resolved `DatasetRef` instances, this is a read-only mapping. For
        unresolved instances, this is always `None`.
        """
        if self._components is None:
            return None
        return MappingProxyType(self._components)

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions associated with the underlying `DatasetType`
        (`DimensionGraph`).
        """
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e., use "!s") for the data ID below
        # because DataCoordinate's __repr__, while adhering to the guidelines
        # for __repr__, is much harder for users to read, while its __str__
        # just produces a dict that can also be passed to DatasetRef's
        # constructor.
        if self.id is not None:
            return (f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r}, "
                    f"components={self._components})")
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __getnewargs_ex__(self) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
        return ((self.datasetType, self.dataId),
                {"id": self.id, "run": self.run, "components": self._components})

    def resolved(self, id: int, run: str, components: Optional[Mapping[str, DatasetRef]] = None
                 ) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is
            created.
        run : `str`
            The run this dataset was associated with when it was created.
        components : `dict`, optional
            A dictionary mapping component name to a `DatasetRef` for that
            component. If ``self`` is already a resolved `DatasetRef`,
            its components will be merged with this dictionary, with this
            dictionary taking precedence.

        Returns
        -------
        ref : `DatasetRef`
            A new resolved `DatasetRef`.
        """
        if self._components is not None:
            newComponents = self._components.copy()
        else:
            newComponents = {}
        if components:
            newComponents.update(components)
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
                          id=id, run=run, hash=self.hash, components=newComponents, conform=False)
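
    # Editor's illustrative sketch, not part of the original source:
    # `resolved` and `unresolved` round-trip everything except the identity
    # fields. ``ref`` is a hypothetical unresolved reference:
    #
    #     resolvedRef = ref.resolved(id=42, run="some_run")
    #     assert resolvedRef.run == "some_run"
    #     assert resolvedRef.unresolved() == ref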

    def unresolved(self) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type,
        but no ID, run, or components.

        Returns
        -------
        ref : `DatasetRef`
            A new unresolved `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, hash=self.hash, conform=False)

    def expanded(self, dataId: ExpandedDataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `ExpandedDataCoordinate`
            Data ID for the new `DatasetRef`. Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run, hash=self.hash, components=self.components,
                          conform=False)

    def isComponent(self) -> bool:
        """Boolean indicating whether this `DatasetRef` refers to a
        component of a composite.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a
        configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names = self.datasetType._lookupNames()

        if "instrument" in self.dataId:
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names
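
    # Editor's illustrative sketch, not part of the original source: for a
    # hypothetical dataset type named "calexp" with storage class "ExposureF"
    # and a data ID containing instrument="HSC", the keys would come back in
    # roughly this priority order:
    #
    #     calexp + instrument="HSC"
    #     ExposureF + instrument="HSC"
    #     calexp
    #     ExposureF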

    @staticmethod
    def flatten(refs: Iterable[DatasetRef], *, parents: bool = True) -> Iterator[DatasetRef]:
        """Recursively transform an iterable over `DatasetRef` to include
        nested component `DatasetRef` instances.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            Input iterable to process. Must contain only resolved `DatasetRef`
            instances (i.e. with `DatasetRef.components` not `None`).
        parents : `bool`, optional
            If `True` (default), include the given datasets in the output
            iterable. If `False`, include only their components. This does
            not propagate recursively: only the outermost level of parents
            is ignored if ``parents`` is `False`.

        Yields
        ------
        ref : `DatasetRef`
            Either one of the given `DatasetRef` instances (only if
            ``parents`` is `True`) or one of its (recursive) children.

        Notes
        -----
        If ``parents`` is `True`, components are guaranteed to be yielded
        before their parents.
        """
        for ref in refs:
            if ref.components is None:
                raise AmbiguousDatasetError(f"Unresolved ref {ref} passed to 'flatten'.")
            yield from DatasetRef.flatten(ref.components.values(), parents=True)
            if parents:
                yield ref
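
    # Editor's illustrative sketch, not part of the original source: for a
    # hypothetical resolved composite ``parentRef`` with components "image"
    # and "mask", components are yielded before their parent:
    #
    #     list(DatasetRef.flatten([parentRef]))
    #     # -> [imageRef, maskRef, parentRef]
    #     list(DatasetRef.flatten([parentRef], parents=False))
    #     # -> [imageRef, maskRef]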

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef], *, recursive: bool = True
                    ) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.
        recursive : `bool`, optional
            If `True` (default), also group any `DatasetRef` instances found
            in the `DatasetRef.components` dictionaries of ``refs``,
            recursively. `True` also checks that references are "resolved"
            (unresolved references never have components).

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``recursive`` is `True` and one or more refs has
            ``DatasetRef.components is None`` (as is always the case for
            unresolved `DatasetRef` objects).
        """
        result = NamedKeyDict()
        # Renamed from ``iter`` to avoid shadowing the built-in.
        refsToGroup = DatasetRef.flatten(refs) if recursive else refs
        for ref in refsToGroup:
            result.setdefault(ref.datasetType, []).append(ref)
        return result
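
    # Editor's illustrative sketch, not part of the original source: grouping
    # two hypothetical resolved refs of different dataset types.
    #
    #     grouped = DatasetRef.groupByType([calexpRef, srcRef])
    #     for datasetType, refsOfType in grouped.items():
    #         ...  # refsOfType is a list of DatasetRef, all of one type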

    def getCheckedId(self) -> int:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists so that operations that would otherwise
        be natural list comprehensions can also check that the ID is not
        `None`.

        Returns
        -------
        id : `int`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``self.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
                                        "a resolved reference is required.")
        return self.id
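
    # Editor's illustrative sketch, not part of the original source: the
    # comprehension below extracts IDs and raises AmbiguousDatasetError if
    # any element of a hypothetical ``refs`` iterable is unresolved:
    #
    #     ids = [ref.getCheckedId() for ref in refs]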

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset (`str` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

@immutable
class FakeDatasetRef:
    """A fake `DatasetRef` that can be used internally by butler where
    only the dataset ID is available.

    Should only be used when registry cannot be used to create a full
    `DatasetRef` from the ID. A particular use case is during dataset
    deletion when only the ID is available.

    Parameters
    ----------
    id : `int`
        The dataset ID.
    """

    __slots__ = ("id",)

    def __new__(cls, id: int):
        self = super().__new__(cls)
        self.id = id
        return self

    def __str__(self):
        return f"dataset_id={self.id}"

    def __repr__(self):
        return f"FakeDatasetRef({self.id})"

    def __eq__(self, other: FakeDatasetRef):
        try:
            return self.id == other.id
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash(self.id)

    @property
    def components(self):
        return {}

    @staticmethod
    def flatten(refs: Iterable[FakeDatasetRef], *, parents: bool = True) -> Iterator[DatasetRef]:
        return DatasetRef.flatten(refs, parents=parents)
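
    # Editor's illustrative sketch, not part of the original source: a
    # FakeDatasetRef carries only the integer ID but hashes on it, so it can
    # stand in for a real DatasetRef in ID-keyed containers:
    #
    #     fake = FakeDatasetRef(42)
    #     assert fake.id == 42
    #     knownIds = {fake}  # usable in sets and dict keys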