# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetRef", "FakeDatasetRef"]

import hashlib
from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional, Tuple

from types import MappingProxyType
from ..dimensions import DataCoordinate, DimensionGraph, ExpandedDataCoordinate
from ..configSupport import LookupKey
from ..utils import immutable, NamedKeyDict
from .type import DatasetType


class AmbiguousDatasetError(Exception):
    """Exception raised when a `DatasetRef` is not resolved (has no ID, run,
    or components), but the requested operation requires one of them.
    """


@immutable
class DatasetRef:
43 """Reference to a Dataset in a `Registry`. 

44 

45 A `DatasetRef` may point to a Dataset that currently does not yet exist 

46 (e.g., because it is a predicted input for provenance). 

47 

48 Parameters 

49 ---------- 

50 datasetType : `DatasetType` 

51 The `DatasetType` for this Dataset. 

52 dataId : `DataCoordinate` 

53 A mapping of dimensions that labels the Dataset within a Collection. 

54 id : `int`, optional 

55 The unique integer identifier assigned when the dataset is created. 

56 run : `str`, optional 

57 The name of the run this dataset was associated with when it was 

58 created. Must be provided if ``id`` is. 

59 hash : `bytes`, optional 

60 A hash of the dataset type and data ID. Should only be provided if 

61 copying from another `DatasetRef` with the same dataset type and data 

62 ID. 

63 components : `dict`, optional 

64 A dictionary mapping component name to a `DatasetRef` for that 

65 component. Should not be passed unless ``id`` is also provided (i.e. 

66 if this is a "resolved" reference). 

67 conform : `bool`, optional 

68 If `True` (default), call `DataCoordinate.standardize` to ensure that 

69 the data ID's dimensions are consistent with the dataset type's. 

70 `DatasetRef` instances for which those dimensions are not equal should 

71 not be created in new code, but are still supported for backwards 

72 compatibility. New code should only pass `False` if it can guarantee 

73 that the dimensions are already consistent. 

74 hasParentId : `bool`, optional 

75 If `True` this `DatasetRef` is a component that has the ``id`` 

76 of the composite parent. This is set if the registry does not 

77 know about individual components but does know about the composite. 

78 

79 Raises 

80 ------ 

81 ValueError 

82 Raised if ``run`` or ``components`` is provided but ``id`` is not, or 

83 if a component dataset is inconsistent with the storage class, or if 

84 ``id`` is provided but ``run`` is not. 
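
    Examples
    --------
    A minimal construction sketch; ``flatType``, the dimension names, and
    the ``id`` and ``run`` values are hypothetical, since real
    `DatasetType` and data ID objects normally come from a `Registry`::

        # Unresolved reference: dataset type and data ID only.
        ref = DatasetRef(flatType, {"instrument": "HSC", "detector": 42})

        # Resolved reference: additionally carries ``id`` and ``run``.
        resolvedRef = DatasetRef(flatType,
                                 {"instrument": "HSC", "detector": 42},
                                 id=101, run="calib/v1")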

85 """ 

86 

87 __slots__ = ("id", "datasetType", "dataId", "run", "_hash", "_components", "hasParentId") 

88 

89 def __new__(cls, datasetType: DatasetType, dataId: DataCoordinate, *, 

90 id: Optional[int] = None, 

91 run: Optional[str] = None, hash: Optional[bytes] = None, 

92 components: Optional[Mapping[str, DatasetRef]] = None, conform: bool = True, 

93 hasParentId: bool = False) -> DatasetRef: 

94 self = super().__new__(cls) 

95 assert isinstance(datasetType, DatasetType) 

96 self.id = id 

97 self.datasetType = datasetType 

98 self.hasParentId = hasParentId 

99 if conform: 

100 self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions) 

101 else: 

102 self.dataId = dataId 

103 if self.id is not None: 

104 self._components = dict() 

105 if components is not None: 

106 self._components.update(components) 

107 for k, v in self._components.items(): 

108 expectedStorageClass = self.datasetType.storageClass.components.get(k) 

109 if expectedStorageClass is None: 

110 raise ValueError(f"{k} is not a valid component for " 

111 f"storage class {self.datasetType.storageClass.name}.") 

112 if not isinstance(v, DatasetRef): 

113 # It's easy to accidentally pass DatasetType or 

114 # StorageClass; make that error message friendly. 

115 raise ValueError(f"Component {k}={v} is not a DatasetRef.") 

116 if v.id is None: 

117 raise ValueError(f"DatasetRef components must be resolved ({k}={v} isn't).") 

118 if expectedStorageClass != v.datasetType.storageClass: 

119 raise ValueError(f"Storage class mismatch for component {k}: " 

120 f"{v.datasetType.storageClass.name} != {expectedStorageClass.name}") 

121 if run is None: 

122 raise ValueError(f"Cannot provide id without run for dataset with id={id}, " 

123 f"type={datasetType}, and dataId={dataId}.") 

124 self.run = run 

125 else: 

126 self._components = None 

127 if components: 

128 raise ValueError("'components' cannot be provided unless 'id' is.") 

129 if run is not None: 

130 raise ValueError("'run' cannot be provided unless 'id' is.") 

131 self.run = None 

132 if hash is not None: 

133 # We only set self._hash if we know it; this plays nicely with 

134 # the @immutable decorator, which allows an attribute to be set 

135 # only one time. 

136 self._hash = hash 

137 return self 

138 

139 def __eq__(self, other: DatasetRef): 

140 try: 

141 return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id) 

142 except AttributeError: 

143 return NotImplemented 

144 

145 def __hash__(self) -> int: 

146 return hash((self.datasetType, self.dataId, self.id)) 

147 

148 @property 

149 def hash(self) -> bytes: 

150 """Secure hash of the `DatasetType` name and data ID (`bytes`). 

151 """ 

152 if not hasattr(self, "_hash"): 

153 message = hashlib.blake2b(digest_size=32) 

154 message.update(self.datasetType.name.encode("utf8")) 

155 self.dataId.fingerprint(message.update) 

156 self._hash = message.digest() 

157 return self._hash 

158 

159 @property 

160 def components(self) -> Optional[Mapping[str, DatasetRef]]: 

161 """Named `DatasetRef` components (`~collections.abc.Mapping` or 

162 `None`). 

163 

164 For resolved `DatasetRef` instances, this is a read-only mapping. For 

165 unresolved instances, this is always `None`. 

166 """ 

167 if self._components is None: 

168 return None 

169 return MappingProxyType(self._components) 

170 

171 @property 

172 def dimensions(self) -> DimensionGraph: 

173 """The dimensions associated with the underlying `DatasetType` 

174 """ 

175 return self.datasetType.dimensions 

176 

177 def __repr__(self) -> str: 

178 # We delegate to __str__ (i.e use "!s") for the data ID) below because 

179 # DataCoordinate's __repr__ - while adhering to the guidelines for 

180 # __repr__ - is much harder to users to read, while its __str__ just 

181 # produces a dict that can also be passed to DatasetRef's constructor. 

182 if self.id is not None: 

183 return (f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r}, " 

184 f"components={self._components})") 

185 else: 

186 return f"DatasetRef({self.datasetType!r}, {self.dataId!s})" 

187 

188 def __str__(self) -> str: 

189 s = f"{self.datasetType.name}@{self.dataId!s}" 

190 if self.id is not None: 

191 s += f" (id={self.id})" 

192 return s 

193 

194 def __getnewargs_ex__(self) -> Tuple[Tuple[Any, ...], Dict[str, Any]]: 

195 return ((self.datasetType, self.dataId), 

196 {"id": self.id, "run": self.run, "components": self._components}) 

197 

198 def resolved(self, id: int, run: str, components: Optional[Mapping[str, DatasetRef]] = None 

199 ) -> DatasetRef: 

200 """Return a new `DatasetRef` with the same data ID and dataset type 

201 and the given ID and run. 

202 

203 Parameters 

204 ---------- 

205 id : `int` 

206 The unique integer identifier assigned when the dataset is created. 

207 run : `str` 

208 The run this dataset was associated with when it was created. 

209 components : `dict`, optional 

210 A dictionary mapping component name to a `DatasetRef` for that 

211 component. If ``self`` is already a resolved `DatasetRef`, 

212 its components will be merged with this dictionary, with this 

213 dictionary taking precedence. 

214 

215 Returns 

216 ------- 

217 ref : `DatasetRef` 

218 A new `DatasetRef`. 
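
        Examples
        --------
        A sketch; the ``id`` and ``run`` values are hypothetical and would
        normally be assigned by a `Registry`::

            resolvedRef = ref.resolved(id=101, run="calib/v1")
            assert resolvedRef.id == 101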

219 """ 

220 if self._components is not None: 

221 newComponents = self._components.copy() 

222 else: 

223 newComponents = {} 

224 if components: 

225 newComponents.update(components) 

226 return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, 

227 id=id, run=run, hash=self.hash, components=newComponents, conform=False) 

228 

229 def unresolved(self) -> DatasetRef: 

230 """Return a new `DatasetRef` with the same data ID and dataset type, 

231 but no ID, run, or components. 

232 

233 Returns 

234 ------- 

235 ref : `DatasetRef` 

236 A new `DatasetRef`. 

237 

238 Notes 

239 ----- 

240 This can be used to compare only the data ID and dataset type of a 

241 pair of `DatasetRef` instances, regardless of whether either is 

242 resolved:: 

243 

244 if ref1.unresolved() == ref2.unresolved(): 

245 ... 

246 """ 

247 return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, hash=self.hash, conform=False) 

248 

249 def expanded(self, dataId: ExpandedDataCoordinate) -> DatasetRef: 

250 """Return a new `DatasetRef` with the given expanded data ID. 

251 

252 Parameters 

253 ---------- 

254 dataId : `ExpandedDataCoordinate` 

255 Data ID for the new `DatasetRef`. Must compare equal to the 

256 original data ID. 

257 

258 Returns 

259 ------- 

260 ref : `DatasetRef` 

261 A new `DatasetRef` with the given data ID. 

262 """ 

263 assert dataId == self.dataId 

264 return DatasetRef(datasetType=self.datasetType, dataId=dataId, 

265 id=self.id, run=self.run, hash=self.hash, components=self.components, 

266 conform=False) 

267 

268 def isComponent(self) -> bool: 

269 """Boolean indicating whether this `DatasetRef` refers to a 

270 component of a composite. 

271 

272 Returns 

273 ------- 

274 isComponent : `bool` 

275 `True` if this `DatasetRef` is a component, `False` otherwise. 

276 """ 

277 return self.datasetType.isComponent() 

278 

279 def isComposite(self) -> bool: 

280 """Boolean indicating whether this `DatasetRef` is a composite type. 

281 

282 Returns 

283 ------- 

284 isComposite : `bool` 

285 `True` if this `DatasetRef` is a composite type, `False` 

286 otherwise. 

287 """ 

288 return self.datasetType.isComposite() 

289 

290 def _lookupNames(self) -> Tuple[LookupKey]: 

291 """Name keys to use when looking up this DatasetRef in a configuration. 

292 

293 The names are returned in order of priority. 

294 

295 Returns 

296 ------- 

297 names : `tuple` of `LookupKey` 

298 Tuple of the `DatasetType` name and the `StorageClass` name. 

299 If ``instrument`` is defined in the dataId, each of those names 

300 is added to the start of the tuple with a key derived from the 

301 value of ``instrument``. 
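
        Examples
        --------
        A rough sketch of the expected priority order; the exact string
        form of each `LookupKey` is illustrative only::

            names = ref._lookupNames()
            # Roughly: (instrument-specific dataset type name,
            #           instrument-specific storage class name,
            #           dataset type name,
            #           storage class name)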

302 """ 

303 # Special case the instrument Dimension since we allow configs 

304 # to include the instrument name in the hierarchy. 

305 names = self.datasetType._lookupNames() 

306 

307 if "instrument" in self.dataId: 

308 names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]}) 

309 for n in names) + names 

310 

311 return names 

312 

313 @staticmethod 

314 def flatten(refs: Iterable[DatasetRef], *, parents: bool = True) -> Iterator[DatasetRef]: 

315 """Recursively transform an iterable over `DatasetRef` to include 

316 nested component `DatasetRef` instances. 

317 

318 Parameters 

319 ---------- 

320 refs : `~collections.abc.Iterable` [ `DatasetRef` ] 

321 Input iterable to process. Must contain only resolved `DatasetRef` 

322 instances (i.e. with `DatasetRef.components` not `None`). 

323 parents : `bool`, optional 

324 If `True` (default) include the given datasets in the output 

325 iterable. If `False`, include only their components. This does 

326 not propagate recursively - only the outermost level of parents 

327 is ignored if ``parents`` is `False`. 

328 

329 Yields 

330 ------ 

331 ref : `DatasetRef` 

332 Either one of the given `DatasetRef` instances (only if ``parent`` 

333 is `True`) or on of its (recursive) children. 

334 

335 Notes 

336 ----- 

337 If ``parents`` is `True`, components are guaranteed to be yielded 

338 before their parents. 
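
        Examples
        --------
        A sketch, assuming ``compositeRef`` is a resolved reference whose
        components are themselves resolved::

            refs = list(DatasetRef.flatten([compositeRef]))
            # Components are yielded first, so the composite comes last.
            assert refs[-1] == compositeRef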

339 """ 

340 for ref in refs: 

341 if ref.components is None: 

342 raise AmbiguousDatasetError(f"Unresolved ref {ref} passed to 'flatten'.") 

343 yield from DatasetRef.flatten(ref.components.values(), parents=True) 

344 if parents: 

345 yield ref 

346 

347 @staticmethod 

348 def groupByType(refs: Iterable[DatasetRef], *, recursive: bool = True 

349 ) -> NamedKeyDict[DatasetType, List[DatasetRef]]: 

350 """Group an iterable of `DatasetRef` by `DatasetType`. 

351 

352 Parameters 

353 ---------- 

354 refs : `Iterable` [ `DatasetRef` ] 

355 `DatasetRef` instances to group. 

356 recursive : `bool`, optional 

357 If `True` (default), also group any `DatasetRef` instances found in 

358 the `DatasetRef.components` dictionaries of ``refs``, recursively. 

359 `True` also checks that references are "resolved" (unresolved 

360 references never have components). 

361 

362 Returns 

363 ------- 

364 grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ] 

365 Grouped `DatasetRef` instances. 

366 

367 Raises 

368 ------ 

369 AmbiguousDatasetError 

370 Raised if ``recursive is True``, and one or more refs has 

371 ``DatasetRef.components is None`` (as is always the case for 

372 unresolved `DatasetRef` objects). 
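
        Examples
        --------
        A minimal sketch, assuming ``refs`` is an iterable of resolved
        `DatasetRef` instances::

            grouped = DatasetRef.groupByType(refs)
            for datasetType, refsOfType in grouped.items():
                print(datasetType.name, len(refsOfType))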

373 """ 

374 result = NamedKeyDict() 

375 iter = DatasetRef.flatten(refs) if recursive else refs 

376 for ref in iter: 

377 result.setdefault(ref.datasetType, []).append(ref) 

378 return result 

379 

380 def getCheckedId(self) -> int: 

381 """Return ``self.id``, or raise if it is `None`. 

382 

383 This trivial method exists to allow operations that would otherwise be 

384 natural list comprehensions to check that the ID is not `None` as well. 

385 

386 Returns 

387 ------- 

388 id : `int` 

389 ``self.id`` if it is not `None`. 

390 

391 Raises 

392 ------ 

393 AmbiguousDatasetError 

394 Raised if ``ref.id`` is `None`. 
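
        Examples
        --------
        The list-comprehension use case described above, as a sketch::

            ids = [ref.getCheckedId() for ref in refs]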

395 """ 

396 if self.id is None: 

397 raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; " 

398 f"a resolved reference is required.") 

399 return self.id 

400 

401 datasetType: DatasetType 

402 """The definition of this dataset (`DatasetType`). 

403 

404 Cannot be changed after a `DatasetRef` is constructed. 

405 """ 

406 

407 dataId: DataCoordinate 

408 """A mapping of `Dimension` primary key values that labels the dataset 

409 within a Collection (`DataCoordinate`). 

410 

411 Cannot be changed after a `DatasetRef` is constructed. 

412 """ 

413 

414 run: Optional[setattr] 

415 """The name of the run that produced the dataset. 

416 

417 Cannot be changed after a `DatasetRef` is constructed; use `resolved` or 

418 `unresolved` to add or remove this information when creating a new 

419 `DatasetRef`. 

420 """ 

421 

422 id: Optional[int] 

423 """Primary key of the dataset (`int` or `None`). 

424 

425 Cannot be changed after a `DatasetRef` is constructed; use `resolved` or 

426 `unresolved` to add or remove this information when creating a new 

427 `DatasetRef`. 

428 """ 

429 

430 

431@immutable 

432class FakeDatasetRef: 

433 """A fake `DatasetRef` that can be used internally by butler where 

434 only the dataset ID is available. 

435 

436 Should only be used when registry can not be used to create a full 

437 `DatasetRef` from the ID. A particular use case is during dataset 

438 deletion when solely the ID is available. 

439 

440 Parameters 

441 ---------- 

442 id : `int` 

443 The dataset ID. 
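
    Examples
    --------
    A minimal sketch::

        fakeRef = FakeDatasetRef(101)
        print(fakeRef)   # prints "dataset_id=101"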

444 """ 

445 __slots__ = ("id",) 

446 

447 def __new__(cls, id: int): 

448 self = super().__new__(cls) 

449 self.id = id 

450 return self 

451 

452 def __str__(self): 

453 return f"dataset_id={self.id}" 

454 

455 def __repr__(self): 

456 return f"FakeDatasetRef({self.id})" 

457 

458 def __eq__(self, other: FakeDatasetRef): 

459 try: 

460 return self.id == other.id 

461 except AttributeError: 

462 return NotImplemented 

463 

464 def __hash__(self) -> int: 

465 return hash(self.id) 

466 

467 @property 

468 def components(self): 

469 return {} 

470 

471 @staticmethod 

472 def flatten(refs: Iterable[FakeDatasetRef], *, parents: bool = True) -> Iterator[DatasetRef]: 

473 return DatasetRef.flatten(refs, parents=parents)