Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

# This file is part of daf_butler. 

# 

# Developed for the LSST Data Management System. 

# This product includes software developed by the LSST Project 

# (http://www.lsst.org). 

# See the COPYRIGHT file at the top-level directory of this distribution 

# for details of code ownership. 

# 

# This program is free software: you can redistribute it and/or modify 

# it under the terms of the GNU General Public License as published by 

# the Free Software Foundation, either version 3 of the License, or 

# (at your option) any later version. 

# 

# This program is distributed in the hope that it will be useful, 

# but WITHOUT ANY WARRANTY; without even the implied warranty of 

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

# GNU General Public License for more details. 

# 

# You should have received a copy of the GNU General Public License 

# along with this program. If not, see <http://www.gnu.org/licenses/>. 

from __future__ import annotations 

 

__all__ = ["DatasetRef"] 

 

import hashlib 

from typing import Any, Dict, Mapping, Optional, Tuple 

 

from types import MappingProxyType 

from ..dimensions import DataCoordinate, DimensionGraph, ExpandedDataCoordinate 

from ..configSupport import LookupKey 

from ..utils import immutable 

from .type import DatasetType 

 

 

@immutable
class DatasetRef:
    """Reference to a Dataset in a `Registry`.

    A `DatasetRef` may point to a Dataset that currently does not yet exist
    (e.g., because it is a predicted input for provenance).

    Parameters
    ----------
    datasetType : `DatasetType`
        The `DatasetType` for this Dataset.
    dataId : `DataCoordinate`
        A mapping of dimensions that labels the Dataset within a Collection.
    id : `int`, optional
        The unique integer identifier assigned when the dataset is created.
    run : `str`, optional
        The name of the run this dataset was associated with when it was
        created.
    hash : `bytes`, optional
        A hash of the dataset type and data ID.  Should only be provided if
        copying from another `DatasetRef` with the same dataset type and data
        ID.
    components : `dict`, optional
        A dictionary mapping component name to a `DatasetRef` for that
        component.  Should not be passed unless ``id`` is also provided (i.e.
        if this is a "resolved" reference).
    conform : `bool`, optional
        If `True` (default), call `DataCoordinate.standardize` to ensure that
        the data ID's dimensions are consistent with the dataset type's.
        `DatasetRef` instances for which those dimensions are not equal should
        not be created in new code, but are still supported for backwards
        compatibility.  New code should only pass `False` if it can guarantee
        that the dimensions are already consistent.

    Raises
    ------
    ValueError
        Raised if ``run`` or ``components`` is provided but ``id`` is not, or
        if a component dataset is inconsistent with the storage class.
    """

    __slots__ = ("id", "datasetType", "dataId", "run", "_hash", "_components")

    def __new__(cls, datasetType: DatasetType, dataId: DataCoordinate, *,
                id: Optional[int] = None,
                run: Optional[str] = None, hash: Optional[bytes] = None,
                components: Optional[Mapping[str, DatasetRef]] = None,
                conform: bool = True) -> DatasetRef:
        # __new__ (rather than __init__) is used so attributes can be set
        # exactly once, as required by the @immutable decorator.
        self = super().__new__(cls)
        assert isinstance(datasetType, DatasetType)
        self.id = id
        self.datasetType = datasetType
        if conform:
            self.dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions)
        else:
            self.dataId = dataId
        if self.id is not None:
            # Resolved reference: components and run are permitted, and each
            # component must itself be resolved and match the storage class
            # declared for that component name.
            self._components = dict()
            if components is not None:
                self._components.update(components)
            for k, v in self._components.items():
                expectedStorageClass = self.datasetType.storageClass.components.get(k)
                if expectedStorageClass is None:
                    raise ValueError(f"{k} is not a valid component for "
                                     f"storage class {self.datasetType.storageClass.name}.")
                if not isinstance(v, DatasetRef):
                    # It's easy to accidentally pass DatasetType or
                    # StorageClass; make that error message friendly.
                    raise ValueError(f"Component {k}={v} is not a DatasetRef.")
                if v.id is None:
                    raise ValueError(f"DatasetRef components must be resolved ({k}={v} isn't).")
                if expectedStorageClass != v.datasetType.storageClass:
                    raise ValueError(f"Storage class mismatch for component {k}: "
                                     f"{v.datasetType.storageClass.name} != {expectedStorageClass.name}")
            # TODO: it would be nice to guarantee that id and run should be
            # either both None or not None together.  We can't easily do that
            # yet because the Query infrastructure has a hard time obtaining
            # run strings, so we allow run to be `None` here, but that will
            # change.
            self.run = run
        else:
            # Unresolved reference: no components or run allowed.
            self._components = None
            if components:
                raise ValueError("'components' cannot be provided unless 'id' is.")
            if run is not None:
                raise ValueError("'run' cannot be provided unless 'id' is.")
            self.run = None
        if hash is not None:
            # We only set self._hash if we know it; this plays nicely with
            # the @immutable decorator, which allows an attribute to be set
            # only one time.
            self._hash = hash
        return self

    def __eq__(self, other: Any) -> bool:
        try:
            return (self.datasetType, self.dataId, self.id) == (other.datasetType, other.dataId, other.id)
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        # Hash over a single tuple of the same fields compared in __eq__;
        # the original passed three separate arguments to hash(), which is a
        # TypeError (hash() takes exactly one argument).
        return hash((self.datasetType, self.dataId, self.id))

    @property
    def hash(self) -> bytes:
        """Secure hash of the `DatasetType` name and data ID (`bytes`).
        """
        if not hasattr(self, "_hash"):
            # Computed lazily; setting the attribute once is permitted by the
            # @immutable decorator.
            message = hashlib.blake2b(digest_size=32)
            message.update(self.datasetType.name.encode("utf8"))
            self.dataId.fingerprint(message.update)
            self._hash = message.digest()
        return self._hash

    @property
    def components(self) -> Optional[Mapping[str, DatasetRef]]:
        """Named `DatasetRef` components (`~collections.abc.Mapping` or
        `None`).

        For resolved `DatasetRef` instances, this is a read-only mapping that
        can be updated in-place via `Registry.attachComponent()`.  For
        unresolved instances, this is always `None`.
        """
        if self._components is None:
            return None
        return MappingProxyType(self._components)

    @property
    def dimensions(self) -> DimensionGraph:
        """The dimensions associated with the underlying `DatasetType`
        (`DimensionGraph`).
        """
        return self.datasetType.dimensions

    def __repr__(self) -> str:
        # We delegate to __str__ (i.e use "!s") for the data ID) below because
        # DataCoordinate's __repr__ - while adhering to the guidelines for
        # __repr__ - is much harder to users to read, while its __str__ just
        # produces a dict that can also be passed to DatasetRef's constructor.
        if self.id is not None:
            return (f"DatasetRef({self.datasetType!r}, {self.dataId!s}, id={self.id}, run={self.run!r}, "
                    f"components={self._components})")
        else:
            return f"DatasetRef({self.datasetType!r}, {self.dataId!s})"

    def __str__(self) -> str:
        s = f"{self.datasetType.name}@{self.dataId!s}"
        if self.id is not None:
            s += f" (id={self.id})"
        return s

    def __getnewargs_ex__(self) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
        # Pickle support: reconstruct via __new__ with the same arguments.
        return ((self.datasetType, self.dataId),
                {"id": self.id, "run": self.run, "components": self._components})

    def resolved(self, id: int, run: str, components: Optional[Mapping[str, DatasetRef]] = None
                 ) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type
        and the given ID and run.

        Parameters
        ----------
        id : `int`
            The unique integer identifier assigned when the dataset is created.
        run : `str`
            The run this dataset was associated with when it was created.
        components : `dict`, optional
            A dictionary mapping component name to a `DatasetRef` for that
            component.  If ``self`` is already a resolved `DatasetRef`,
            its components will be merged with this dictionary, with this
            dictionary taking precedence.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.
        """
        if self._components is not None:
            newComponents = self._components.copy()
        else:
            newComponents = {}
        if components:
            newComponents.update(components)
        # Pass the (possibly already-computed) hash and conform=False since
        # the data ID is known to already be consistent.
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId,
                          id=id, run=run, hash=self.hash, components=newComponents, conform=False)

    def unresolved(self) -> DatasetRef:
        """Return a new `DatasetRef` with the same data ID and dataset type,
        but no ID, run, or components.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef`.

        Notes
        -----
        This can be used to compare only the data ID and dataset type of a
        pair of `DatasetRef` instances, regardless of whether either is
        resolved::

            if ref1.unresolved() == ref2.unresolved():
                ...
        """
        return DatasetRef(datasetType=self.datasetType, dataId=self.dataId, hash=self.hash, conform=False)

    def expanded(self, dataId: ExpandedDataCoordinate) -> DatasetRef:
        """Return a new `DatasetRef` with the given expanded data ID.

        Parameters
        ----------
        dataId : `ExpandedDataCoordinate`
            Data ID for the new `DatasetRef`.  Must compare equal to the
            original data ID.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` with the given data ID.
        """
        assert dataId == self.dataId
        return DatasetRef(datasetType=self.datasetType, dataId=dataId,
                          id=self.id, run=self.run, hash=self.hash, components=self.components,
                          conform=False)

    def isComponent(self) -> bool:
        """Boolean indicating whether this `DatasetRef` refers to a
        component of a composite.

        Returns
        -------
        isComponent : `bool`
            `True` if this `DatasetRef` is a component, `False` otherwise.
        """
        return self.datasetType.isComponent()

    def isComposite(self) -> bool:
        """Boolean indicating whether this `DatasetRef` is a composite type.

        Returns
        -------
        isComposite : `bool`
            `True` if this `DatasetRef` is a composite type, `False`
            otherwise.
        """
        return self.datasetType.isComposite()

    def _lookupNames(self) -> Tuple[LookupKey, ...]:
        """Name keys to use when looking up this DatasetRef in a configuration.

        The names are returned in order of priority.

        Returns
        -------
        names : `tuple` of `LookupKey`
            Tuple of the `DatasetType` name and the `StorageClass` name.
            If ``instrument`` is defined in the dataId, each of those names
            is added to the start of the tuple with a key derived from the
            value of ``instrument``.
        """
        # Special case the instrument Dimension since we allow configs
        # to include the instrument name in the hierarchy.
        names = self.datasetType._lookupNames()

        if "instrument" in self.dataId:
            names = tuple(n.clone(dataId={"instrument": self.dataId["instrument"]})
                          for n in names) + names

        return names

    datasetType: DatasetType
    """The definition of this dataset (`DatasetType`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    dataId: DataCoordinate
    """A mapping of `Dimension` primary key values that labels the dataset
    within a Collection (`DataCoordinate`).

    Cannot be changed after a `DatasetRef` is constructed.
    """

    run: Optional[str]
    """The name of the run that produced the dataset (`str` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """

    id: Optional[int]
    """Primary key of the dataset (`int` or `None`).

    Cannot be changed after a `DatasetRef` is constructed; use `resolved` or
    `unresolved` to add or remove this information when creating a new
    `DatasetRef`.
    """