
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetRef"]

from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
)

from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse
from ..configSupport import LookupKey
from ..utils import immutable
from ..named import NamedKeyDict
from .type import DatasetType
from ..json import from_json_generic, to_json_generic

if TYPE_CHECKING:
    from ...registry import Registry


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created.  Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility.  New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.
    """

    __slots__ = ("id", "datasetType", "dataId", "run",)

    def __init__(
        self,
        datasetType: DatasetType, dataId: DataCoordinate, *,
        id: Optional[int] = None,
        run: Optional[str] = None,
        conform: bool = True
    ):
        self.id = id
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            if run is None:
                raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
                                 f"type={datasetType}, and dataId={dataId}.")
            self.run = run
        else:
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
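
    # A hypothetical construction sketch (illustration only; ``datasetType``
    # and ``dataId`` are assumed to be existing DatasetType and
    # DataCoordinate instances):
    #
    #     unresolved = DatasetRef(datasetType, dataId)
    #     resolved = DatasetRef(datasetType, dataId, id=42, run="my_run")
    #     DatasetRef(datasetType, dataId, id=42)  # ValueError: id without run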

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below
        # because DataCoordinate's __repr__ - while adhering to the
        # guidelines for __repr__ - is much harder for users to read, while
        # its __str__ just produces a dict that can also be passed to
        # DatasetRef's constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}, sc={self.datasetType.storageClass.name}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, then by DatasetType name, then by DataCoordinate.
        # The __str__ representation is probably close enough, but we need
        # to ensure that sorting a DatasetRef matches what you would get if
        # you sorted by DatasetType and DataCoordinate directly.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; it takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)
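
    # A hypothetical usage sketch (illustration only): because __lt__
    # compares (run, datasetType, dataId) tuples, sorting a mixed list of
    # resolved and unresolved refs puts unresolved refs (empty run string)
    # first, then orders by run name:
    #
    #     refs = [ref_b, ref_a]   # existing DatasetRef instances
    #     refs.sort()             # uses DatasetRef.__lt__ above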

    def to_simple(self, minimal: bool = False) -> Dict:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization.  Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `dict`
            The object converted to a dictionary.
        """
        if minimal and self.id is not None:
            # The only thing needed to uniquely define a DatasetRef is its
            # integer id, so that can be used directly if the ref is
            # resolved and is not a component DatasetRef.
            # Store it in a dict to allow us to easily add the planned
            # origin information later without having to support both an
            # int and a dict in simple form.
            simple: Dict[str, Any] = {"id": self.id}
            if self.isComponent():
                # We can still be a little minimalist with a component,
                # but we will also need to record the datasetType component.
                simple["component"] = self.datasetType.component()
            return simple

        # Convert to a dict form.
        as_dict: Dict[str, Any] = {"datasetType": self.datasetType.to_simple(minimal=minimal),
                                   "dataId": self.dataId.to_simple(),
                                   }

        # Only include the id and run entries if they are defined.
        if self.id is not None:
            as_dict["run"] = self.run
            as_dict["id"] = self.id

        return as_dict
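
    # A hypothetical sketch of the two serialized forms (values are
    # illustrative only):
    #
    #     ref.to_simple(minimal=True)   # -> {"id": 42}
    #     ref.to_simple()               # -> {"datasetType": {...},
    #                                   #     "dataId": {...},
    #                                   #     "run": "my_run", "id": 42}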

    @classmethod
    def from_simple(cls, simple: Dict,
                    universe: Optional[DimensionUniverse] = None,
                    registry: Optional[Registry] = None) -> DatasetRef:
        """Construct a new object from simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `dict` of [`str`, `Any`]
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple form of a DatasetRef to
            a full `DatasetRef`.  Can be `None` if a full description of
            the type is provided along with a universe.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
        """
        # A minimalist form specifies only the id (and possibly a
        # component) and requires a registry to reconstruct.
        if set(simple).issubset({"id", "component"}):
            if registry is None:
                raise ValueError("Registry is required to construct a DatasetRef from an integer id")
            ref = registry.getDataset(simple["id"])
            if ref is None:
                raise RuntimeError(f"No matching dataset found in registry for id {simple['id']}")
            if "component" in simple:
                ref = ref.makeComponentRef(simple["component"])
            return ref

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            universe = registry.dimensions

        if universe is None:
            # This is for mypy.
            raise ValueError("Unable to determine a usable universe")

        datasetType = DatasetType.from_simple(simple["datasetType"], universe=universe, registry=registry)
        dataId = DataCoordinate.from_simple(simple["dataId"], universe=universe)
        # ``id`` and ``run`` are only present for resolved refs, so fall
        # back to None for unresolved ones.
        return cls(datasetType, dataId,
                   id=simple.get("id"), run=simple.get("run"))

    to_json = to_json_generic
    from_json = classmethod(from_json_generic)
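
    # A hypothetical round-trip sketch (illustration only): to_json and
    # from_json are the generic helpers imported above, so a ref can travel
    # as JSON.  ``universe`` is assumed to be an existing DimensionUniverse:
    #
    #     payload = ref.to_json()
    #     same_ref = DatasetRef.from_json(payload, universe=universe)
    #     assert same_ref == ref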

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: Optional[int],
        run: Optional[str],
    ) -> DatasetRef:
        """Create a new `DatasetRef`.

        A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self

    def resolved(self, id: int, run: str) -> DatasetRef:
        """Return a resolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is
            created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
                          id=id, run=run, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return an unresolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type,
        but no ID or run.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False)

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`.  Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run,
                          conform=False)

    def isComponent(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Indicate whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special-case the instrument Dimension, since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded
        # to assume it will return False if the type doesn't match the key
        # type of the Mapping.
        if "instrument" in self.dataId:  # type: ignore
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result
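
    # A hypothetical usage sketch (illustration only; ``refs`` is assumed
    # to be an existing iterable of DatasetRef instances):
    #
    #     for datasetType, typedRefs in DatasetRef.groupByType(refs).items():
    #         print(datasetType.name, len(typedRefs))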

    def getCheckedId(self) -> int:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise
        be natural list comprehensions to check that the ID is not `None`
        as well.

        Returns
        -------
        id : `int`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
                                        "a resolved reference is required.")
        return self.id
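
    # A hypothetical usage sketch (illustration only): getCheckedId lets a
    # list comprehension fail loudly on unresolved refs instead of silently
    # collecting `None` values:
    #
    #     ids = [ref.getCheckedId() for ref in refs]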

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.

        Requires that this `DatasetRef` is a component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the
            composite parent of this component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(self.datasetType.makeCompositeDatasetType(), self.dataId,
                          id=self.id, run=self.run, conform=False)

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(self.datasetType.makeComponentDatasetType(name), self.dataId,
                          id=self.id, run=self.run, conform=False)
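
    # A hypothetical round-trip sketch (illustration only; the component
    # name "wcs" is illustrative and depends on the storage class):
    #
    #     wcsRef = ref.makeComponentRef("wcs")   # composite -> component
    #     parent = wcsRef.makeCompositeRef()     # component -> composite
    #     assert parent.datasetType == ref.datasetType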

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """