# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetRef"]

from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
)

from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse
from ..configSupport import LookupKey
from ..utils import immutable
from ..named import NamedKeyDict
from .type import DatasetType
from ..json import from_json_generic, to_json_generic

if TYPE_CHECKING:
    from ...registry import Registry


class AmbiguousDatasetError(Exception):
    """Exception raised when a `DatasetRef` is not resolved (has no ID or
    run), but the requested operation requires one of them.
    """


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created.  Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility.  New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.
    """

    __slots__ = ("id", "datasetType", "dataId", "run",)

    def __init__(
        self,
        datasetType: DatasetType, dataId: DataCoordinate, *,
        id: Optional[int] = None,
        run: Optional[str] = None,
        conform: bool = True
    ):
        self.id = id
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            if run is None:
                raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
                                 f"type={datasetType}, and dataId={dataId}.")
            self.run = run
        else:
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
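
    # Usage sketch (hypothetical dataset type and data ID names, not
    # defined in this module):
    #
    #     ref = DatasetRef(flatType, dataId)                    # unresolved
    #     assert ref.id is None and ref.run is None
    #     ref = DatasetRef(flatType, dataId, id=101, run="calib/run1")
    #     DatasetRef(flatType, dataId, id=101)  # ValueError: id requires run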

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions associated with the underlying `DatasetType`.
        """
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e., use "!s") for the data ID below
        # because DataCoordinate's __repr__ - while adhering to the
        # guidelines for __repr__ - is much harder for users to read, while
        # its __str__ just produces a dict that can also be passed to
        # DatasetRef's constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}, sc={self.datasetType.storageClass.name}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, then DatasetType name, then DataCoordinate.
        # The __str__ representation is probably close enough, but we need
        # to ensure that sorting a DatasetRef matches what you would get if
        # you sorted DatasetType+DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; it takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)
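
    # Sorting sketch: unresolved refs (run is None) compare as the empty
    # string, so they sort before refs from any named run; within a run,
    # ordering falls back to DatasetType and then DataCoordinate
    # (hypothetical refs assumed):
    #
    #     sorted([resolved_ref, unresolved_ref])
    #     # -> [unresolved_ref, resolved_ref]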

    def to_simple(self, minimal: bool = False) -> Dict:
        """Convert this class to a simple python type suitable for
        serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization.  Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `dict`
            The object converted to a dictionary.
        """
        if minimal and self.id is not None:
            # The only thing needed to uniquely define a DatasetRef is its
            # integer id, so that can be used directly if the ref is
            # resolved and is not a component DatasetRef.
            # Store it in a dict to allow us to easily add the planned
            # origin information later without having to support both an
            # int and a dict in simple form.
            simple: Dict[str, Any] = {"id": self.id}
            if self.isComponent():
                # We can still be a little minimalist with a component,
                # but we will also need to record the datasetType component.
                simple["component"] = self.datasetType.component()
            return simple

        # Convert to a dict form.
        as_dict: Dict[str, Any] = {"datasetType": self.datasetType.to_simple(minimal=minimal),
                                   "dataId": self.dataId.to_simple(),
                                   }

        # Only include the id entry if it is defined.
        if self.id is not None:
            as_dict["run"] = self.run
            as_dict["id"] = self.id

        return as_dict

    @classmethod
    def from_simple(cls, simple: Dict,
                    universe: Optional[DimensionUniverse] = None,
                    registry: Optional[Registry] = None) -> DatasetRef:
        """Construct a new object from the data returned by the `to_simple`
        method.

        Parameters
        ----------
        simple : `dict` of [`str`, `Any`]
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple form of a DatasetRef to
            a full `DatasetRef`.  Can be `None` if a full description of
            the type is provided along with a universe.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
        """

        # A minimal form specifies only the id (and possibly a component)
        # and requires a registry to reconstruct.
        if set(simple).issubset({"id", "component"}):
            if registry is None:
                raise ValueError("Registry is required to construct a DatasetRef from an integer id")
            ref = registry.getDataset(simple["id"])
            if ref is None:
                raise RuntimeError(f"No matching dataset found in registry for id {simple['id']}")
            if "component" in simple:
                ref = ref.makeComponentRef(simple["component"])
            return ref

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            universe = registry.dimensions

        if universe is None:
            # This is for mypy.
            raise ValueError("Unable to determine a usable universe")

        datasetType = DatasetType.from_simple(simple["datasetType"], universe=universe, registry=registry)
        dataId = DataCoordinate.from_simple(simple["dataId"], universe=universe)
        return cls(datasetType, dataId,
                   id=simple.get("id"), run=simple.get("run"))
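
    # Serialization round-trip sketch (``ref``, ``universe``, and
    # ``registry`` are assumed to exist):
    #
    #     simple = ref.to_simple()
    #     same = DatasetRef.from_simple(simple, universe=universe)
    #     assert same == ref
    #
    # With ``minimal=True`` and a resolved ref, only {"id": ...} is stored,
    # so a registry is required to reconstruct:
    #
    #     simple = ref.to_simple(minimal=True)
    #     same = DatasetRef.from_simple(simple, registry=registry)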

    to_json = to_json_generic
    from_json = classmethod(from_json_generic)

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: Optional[int],
        run: Optional[str],
    ) -> DatasetRef:
        """A custom factory method for use by `__reduce__` as a workaround
        for its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def resolved(self, id: int, run: str) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is
            created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
                          id=id, run=run, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type,
        but no ID or run.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False)
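
    # Resolving round-trip sketch (hypothetical id/run values):
    #
    #     resolved = ref.unresolved().resolved(id=101, run="calib/run1")
    #     assert resolved.unresolved() == ref.unresolved()
    #     assert resolved.datasetType == ref.datasetType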

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`.  Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run,
                          conform=False)

    def isComponent(self) -> bool:
        """Boolean indicating whether this `DatasetRef` refers to a
        component of a composite.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a
        configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded
        # to assume it will return False if the type doesn't match the key
        # type of the Mapping.
        if "instrument" in self.dataId:  # type: ignore
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names
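
    # Lookup-key sketch: for a ref whose data ID includes an instrument,
    # instrument-specific keys are prepended (hypothetical values):
    #
    #     ref._lookupNames()
    #     # -> (LookupKey for "flat" with instrument="HSC",
    #     #     LookupKey for its StorageClass with instrument="HSC",
    #     #     LookupKey for "flat",
    #     #     LookupKey for its StorageClass)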

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result
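
    # Grouping sketch (``refs`` is any iterable of DatasetRef, assumed):
    #
    #     for datasetType, typed_refs in DatasetRef.groupByType(refs).items():
    #         print(datasetType.name, len(typed_refs))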

    def getCheckedId(self) -> int:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists so that operations that would otherwise
        be natural list comprehensions can also check that the ID is not
        `None`.

        Returns
        -------
        id : `int`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
                                        "a resolved reference is required.")
        return self.id
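
    # Sketch of the list-comprehension pattern the docstring describes,
    # raising AmbiguousDatasetError if any ref is unresolved
    # (``refs`` assumed):
    #
    #     ids = [ref.getCheckedId() for ref in refs]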

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.

        Requires that this `DatasetRef` is a component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the
            composite parent of this component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(self.datasetType.makeCompositeDatasetType(), self.dataId,
                          id=self.id, run=self.run, conform=False)

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component of this
        dataset.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(self.datasetType.makeComponentDatasetType(name), self.dataId,
                          id=self.id, run=self.run, conform=False)
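
    # Component round-trip sketch ("wcs" is a hypothetical component of an
    # Exposure-like composite):
    #
    #     wcs_ref = ref.makeComponentRef("wcs")
    #     assert wcs_ref.isComponent()
    #     assert wcs_ref.makeCompositeRef() == ref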

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """