# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["AmbiguousDatasetError", "DatasetRef", "SerializedDatasetRef"]

from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
)

from pydantic import BaseModel, StrictStr, ConstrainedInt, validator

from ..dimensions import DataCoordinate, DimensionGraph, DimensionUniverse
from ..configSupport import LookupKey
from ..utils import immutable
from ..named import NamedKeyDict
from .type import DatasetType, SerializedDatasetType
from ..json import from_json_pydantic, to_json_pydantic

if TYPE_CHECKING:
    from ...registry import Registry


class AmbiguousDatasetError(Exception):
    """Raised when a `DatasetRef` is not resolved but should be.

    This happens when the `DatasetRef` has no ID or run but the requested
    operation requires one of them.
    """
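
# Illustrative sketch only (not part of the API): an unresolved ref has
# ``id`` and ``run`` of `None`, and operations that require them raise
# AmbiguousDatasetError.  ``datasetType`` and ``dataId`` are assumed to be
# pre-existing, valid instances.
#
#   >>> ref = DatasetRef(datasetType, dataId)
#   >>> ref.id is None and ref.run is None
#   True
#   >>> ref.getCheckedId()
#   Traceback (most recent call last):
#       ...
#   AmbiguousDatasetError: ...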


class PositiveInt(ConstrainedInt):
    ge = 0
    strict = True


class SerializedDatasetRef(BaseModel):
    """Simplified model of a `DatasetRef` suitable for serialization."""

    id: Optional[PositiveInt] = None
    datasetType: Optional[SerializedDatasetType] = None
    dataId: Optional[Dict[str, Any]] = None  # Do not use specialist pydantic model for this
    run: Optional[StrictStr] = None
    component: Optional[StrictStr] = None

    @validator("dataId")
    def check_dataId(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if (d := "datasetType") in values and values[d] is None:
            raise ValueError("Can not specify 'dataId' without specifying 'datasetType'")
        return v

    @validator("run")
    def check_run(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        if v and (i := "id") in values and values[i] is None:
            raise ValueError("'run' cannot be provided unless 'id' is.")
        return v

    @validator("component")
    def check_component(cls, v: Any, values: Dict[str, Any]) -> Any:  # noqa: N805
        # Component should not be given if datasetType is given.
        if v and (d := "datasetType") in values and values[d] is not None:
            raise ValueError(f"datasetType ({values[d]}) can not be set if component is given ({v}).")
        return v
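
# A minimal sketch (not part of the module) of how the validators above
# behave; the field values here are made up for illustration.
#
#   >>> SerializedDatasetRef(id=5, run="ingest/run1")   # valid
#   >>> SerializedDatasetRef(run="ingest/run1")         # run without id
#   Traceback (most recent call last):
#       ...
#   pydantic.ValidationError: ... 'run' cannot be provided unless 'id' is.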


@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created.  Must be provided if ``id`` is.
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility.  New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` is provided but ``id`` is not, or if ``id`` is
        provided but ``run`` is not.
    """

    _serializedType = SerializedDatasetRef
    __slots__ = ("id", "datasetType", "dataId", "run",)

    def __init__(
        self,
        datasetType: DatasetType, dataId: DataCoordinate, *,
        id: Optional[int] = None,
        run: Optional[str] = None,
        conform: bool = True
    ):
        self.id = id
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            if run is None:
                raise ValueError(f"Cannot provide id without run for dataset with id={id}, "
                                 f"type={datasetType}, and dataId={dataId}.")
            self.run = run
        else:
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
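
    # Construction sketch (``datasetType`` and ``dataId`` are assumed,
    # pre-existing objects): an id must always be paired with a run.
    #
    #   >>> DatasetRef(datasetType, dataId)                     # unresolved
    #   >>> DatasetRef(datasetType, dataId, id=42, run="run1")  # resolved
    #   >>> DatasetRef(datasetType, dataId, id=42)              # ValueError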

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def dimensions(self) -> DimensionGraph:
        """Dimensions associated with the underlying `DatasetType`."""
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e. use "!s") for the data ID below because
        # DataCoordinate's __repr__ - while adhering to the guidelines for
        # __repr__ - is much harder for users to read, while its __str__ just
        # produces a dict that can also be passed to DatasetRef's constructor.
        if self.id is not None:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r})"
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}, sc={self.datasetType.storageClass.name}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __lt__(self, other: Any) -> bool:
        # Sort by run, then DatasetType name, then DataCoordinate.
        # The __str__ representation is probably close enough, but we
        # need to ensure that sorting a DatasetRef matches what you would
        # get if you sorted DatasetType + DataCoordinate.
        if not isinstance(other, type(self)):
            return NotImplemented

        # Group by run if defined; it takes precedence over DatasetType.
        self_run = "" if self.run is None else self.run
        other_run = "" if other.run is None else other.run

        # Compare tuples in priority order.
        return (self_run, self.datasetType, self.dataId) < (other_run, other.datasetType, other.dataId)
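
    # Ordering sketch (hypothetical refs sharing a dataset type and data
    # ID): unresolved refs sort first because run=None maps to "".
    #
    #   >>> sorted([ref_run_b, ref_unresolved, ref_run_a])
    #   [ref_unresolved, ref_run_a, ref_run_b]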

    def to_simple(self, minimal: bool = False) -> SerializedDatasetRef:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization.  Requires Registry to convert
            back to a full type.

        Returns
        -------
        simple : `SerializedDatasetRef`
            The object converted to simplified form.
        """
        if minimal and self.id is not None:
            # The only thing needed to uniquely define a DatasetRef
            # is the integer id, so that can be used directly if it is
            # resolved and if it is not a component DatasetRef.
            # Store it in a dict to allow us to easily add the planned
            # origin information later without having to support
            # both an int and a dict in simple form.
            simple: Dict[str, Any] = {"id": self.id}
            if self.isComponent():
                # We can still be a little minimalist with a component,
                # but we will also need to record the datasetType component.
                simple["component"] = self.datasetType.component()
            return SerializedDatasetRef(**simple)

        # Convert to a dict form.
        as_dict: Dict[str, Any] = {"datasetType": self.datasetType.to_simple(minimal=minimal),
                                   "dataId": self.dataId.to_simple(),
                                   }

        # Only include the id entry if it is defined.
        if self.id is not None:
            as_dict["run"] = self.run
            as_dict["id"] = self.id

        return SerializedDatasetRef(**as_dict)
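
    # Serialization sketch (hypothetical resolved ref): the minimal form
    # keeps only the id (plus the component name, if any), so a Registry is
    # needed to expand it again; the full form embeds the dataset type and
    # data ID too.
    #
    #   >>> ref.to_simple(minimal=True).id == ref.id
    #   True
    #   >>> ref.to_simple().datasetType is not None
    #   True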

    @classmethod
    def from_simple(cls, simple: SerializedDatasetRef,
                    universe: Optional[DimensionUniverse] = None,
                    registry: Optional[Registry] = None) -> DatasetRef:
        """Construct a new object from simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDatasetRef`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.
            Can be `None` if a registry is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry to use to convert the simple form of a `DatasetRef` to
            a full `DatasetRef`.  Can be `None` if a full description of
            the type is provided along with a universe.

        Returns
        -------
        ref : `DatasetRef`
            Newly-constructed object.
        """
        # A minimalist form will specify just the id (and possibly a
        # component) and require the registry to reconstruct the rest.
        if set(simple.dict(exclude_unset=True, exclude_defaults=True)).issubset({"id", "component"}):
            if registry is None:
                raise ValueError("Registry is required to construct component DatasetRef from integer id")
            if simple.id is None:
                raise ValueError("For minimal DatasetRef the ID must be defined.")
            ref = registry.getDataset(simple.id)
            if ref is None:
                raise RuntimeError(f"No matching dataset found in registry for id {simple.id}")
            if simple.component:
                ref = ref.makeComponentRef(simple.component)
            return ref

        if universe is None and registry is None:
            raise ValueError("One of universe or registry must be provided.")

        if universe is None and registry is not None:
            universe = registry.dimensions

        if universe is None:
            # This is for mypy.
            raise ValueError("Unable to determine a usable universe")

        if simple.datasetType is None:
            # mypy
            raise ValueError("The DatasetType must be specified to construct a DatasetRef")
        datasetType = DatasetType.from_simple(simple.datasetType, universe=universe, registry=registry)

        if simple.dataId is None:
            # mypy
            raise ValueError("The DataId must be specified to construct a DatasetRef")
        dataId = DataCoordinate.from_simple(simple.dataId, universe=universe)
        return cls(datasetType, dataId,
                   id=simple.id, run=simple.run)
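
    # Round-trip sketch (hypothetical ``ref`` and ``registry``): a full
    # simple form can be rebuilt from a universe alone; a minimal one needs
    # the registry to look the dataset up again by id.
    #
    #   >>> simple = ref.to_simple()
    #   >>> DatasetRef.from_simple(simple, universe=registry.dimensions) == ref
    #   True
    #   >>> DatasetRef.from_simple(ref.to_simple(minimal=True), registry=registry) == ref
    #   True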

    to_json = to_json_pydantic
    from_json = classmethod(from_json_pydantic)
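
    # JSON sketch (hypothetical ``ref`` and ``registry``): these
    # pydantic-backed helpers go through the simple form above, producing
    # and consuming a JSON string.
    #
    #   >>> j = ref.to_json()
    #   >>> DatasetRef.from_json(j, universe=registry.dimensions) == ref
    #   True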

    @classmethod
    def _unpickle(
        cls,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        id: Optional[int],
        run: Optional[str],
    ) -> DatasetRef:
        """Create new `DatasetRef`.

        A custom factory method for use by `__reduce__` as a workaround for
        its lack of support for keyword arguments.
        """
        return cls(datasetType, dataId, id=id, run=run)

    def __reduce__(self) -> tuple:
        return (self._unpickle, (self.datasetType, self.dataId, self.id, self.run))

    def __deepcopy__(self, memo: dict) -> DatasetRef:
        # DatasetRef is recursively immutable; see note in @immutable
        # decorator.
        return self
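
    # Pickling sketch: __reduce__ routes through _unpickle because the
    # constructor's keyword-only arguments cannot be passed positionally.
    # Assumes the constituent objects are themselves picklable.
    #
    #   >>> import pickle
    #   >>> pickle.loads(pickle.dumps(ref)) == ref
    #   True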

    def resolved(self, id: int, run: str) -> DatasetRef:
        """Return resolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is
            created.
        run : `str`
            The run this dataset was associated with when it was created.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
                          id=id, run=run, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return unresolved `DatasetRef`.

        This is a new `DatasetRef` with the same data ID and dataset type,
        but no ID or run.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, conform=False)
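
    # Resolution sketch (hypothetical values): resolved() and unresolved()
    # are inverses up to the (id, run) pair.
    #
    #   >>> resolved = ref.resolved(id=42, run="ingest/run1")
    #   >>> resolved.unresolved() == ref.unresolved()
    #   True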

    def expanded(self, dataId: DataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `DataCoordinate`
            Data ID for the new `DatasetRef`.  Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run,
                          conform=False)

    def isComponent(self) -> bool:
        """Indicate whether this `DatasetRef` refers to a component.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Indicate whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special-case the instrument Dimension, since we allow configs
        # to include the instrument name in the hierarchy.
        names: Tuple[LookupKey, ...] = self.datasetType._lookupNames()

        # mypy doesn't think this could return True, because even though
        # __contains__ can take an object of any type, it seems hard-coded to
        # assume it will return False if the type doesn't match the key type
        # of the Mapping.
        if "instrument" in self.dataId:  # type: ignore
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names
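
    # Priority sketch: instrument-qualified lookup keys are prepended, so a
    # config entry specialized for the data ID's instrument wins over a
    # generic one.  For a ref whose dataId contains instrument="HSC"
    # (hypothetical), the tuple is ordered roughly as:
    #
    #   (datasetType name + HSC, storage class + HSC,
    #    datasetType name, storage class)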

    @staticmethod
    def groupByType(refs: Iterable[DatasetRef]) -> NamedKeyDict[DatasetType, List[DatasetRef]]:
        """Group an iterable of `DatasetRef` by `DatasetType`.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` ]
            `DatasetRef` instances to group.

        Returns
        -------
        grouped : `NamedKeyDict` [ `DatasetType`, `list` [ `DatasetRef` ] ]
            Grouped `DatasetRef` instances.
        """
        result: NamedKeyDict[DatasetType, List[DatasetRef]] = NamedKeyDict()
        for ref in refs:
            result.setdefault(ref.datasetType, []).append(ref)
        return result
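
    # Grouping sketch (hypothetical refs of two dataset types):
    #
    #   >>> grouped = DatasetRef.groupByType([flat_ref, bias_ref1, bias_ref2])
    #   >>> [(dt.name, len(rs)) for dt, rs in grouped.items()]
    #   [('flat', 1), ('bias', 2)]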

    def getCheckedId(self) -> int:
        """Return ``self.id``, or raise if it is `None`.

        This trivial method exists to allow operations that would otherwise
        be natural list comprehensions to check that the ID is not `None` as
        well.

        Returns
        -------
        id : `int`
            ``self.id`` if it is not `None`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if self.id is None:
            raise AmbiguousDatasetError(f"ID for dataset {self} is `None`; "
                                        "a resolved reference is required.")
        return self.id
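
    # Usage sketch: validate and collect IDs in one comprehension, raising
    # AmbiguousDatasetError if any ref is unresolved (``refs`` is
    # hypothetical).
    #
    #   >>> ids = [ref.getCheckedId() for ref in refs]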

    def makeCompositeRef(self) -> DatasetRef:
        """Create a `DatasetRef` of the composite from a component ref.

        Requires that this `DatasetRef` is a component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the
            composite parent of this component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(self.datasetType.makeCompositeDatasetType(), self.dataId,
                          id=self.id, run=self.run, conform=False)

    def makeComponentRef(self, name: str) -> DatasetRef:
        """Create a `DatasetRef` that corresponds to a component.

        Parameters
        ----------
        name : `str`
            Name of the component.

        Returns
        -------
        ref : `DatasetRef`
            A `DatasetRef` with a dataset type that corresponds to the given
            component, and the same ID and run
            (which may be `None`, if they are `None` in ``self``).
        """
        # Assume that the data ID does not need to be standardized
        # and should match whatever this ref already has.
        return DatasetRef(self.datasetType.makeComponentDatasetType(name), self.dataId,
                          id=self.id, run=self.run, conform=False)
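
    # Component sketch (hypothetical component name "wcs"): the two methods
    # are inverses, preserving the data ID, id, and run.
    #
    #   >>> comp = ref.makeComponentRef("wcs")
    #   >>> comp.isComponent()
    #   True
    #   >>> comp.makeCompositeRef() == ref
    #   True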

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset.

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """