# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Module defining GraphBuilder class and related methods.
"""

__all__ = ['GraphBuilder']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import copy
from collections import namedtuple
from itertools import chain
import logging

# -----------------------------
# Imports for other modules --
# -----------------------------
from .graph import QuantumGraphTaskNodes, QuantumGraph
from lsst.daf.butler import Quantum, DatasetRef, DimensionSet

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

_LOG = logging.getLogger(__name__.partition(".")[2])

# Tuple containing TaskDef, its input dataset types and output dataset types.
#
# Attributes
# ----------
# taskDef : `TaskDef`
# inputs : `set` of `DatasetType`
# outputs : `set` of `DatasetType`
# initInputs : `set` of `DatasetType`
# initOutputs : `set` of `DatasetType`
# perDatasetTypeDimensions : `~lsst.daf.butler.DimensionSet`
# prerequisite : `set` of `DatasetType`
_TaskDatasetTypes = namedtuple("_TaskDatasetTypes", ("taskDef", "inputs", "outputs",
                                                     "initInputs", "initOutputs",
                                                     "perDatasetTypeDimensions", "prerequisite"))


class GraphBuilderError(Exception):
    """Base class for exceptions generated by graph builder.
    """
    pass


class OutputExistsError(GraphBuilderError):
    """Exception generated when output datasets already exist.
    """

    def __init__(self, taskName, refs):
        refs = ', '.join(str(ref) for ref in refs)
        msg = "Output datasets already exist for task {}: {}".format(taskName, refs)
        GraphBuilderError.__init__(self, msg)


class PrerequisiteMissingError(GraphBuilderError):
    """Exception generated when a prerequisite dataset does not exist.
    """
    pass


class GraphBuilder(object):
    """
    GraphBuilder is responsible for building a task execution graph from
    a Pipeline.

    Parameters
    ----------
    taskFactory : `TaskFactory`
        Factory object used to load/instantiate PipelineTasks.
    registry : `~lsst.daf.butler.Registry`
        Data butler registry instance.
    skipExisting : `bool`, optional
        If ``True`` (default), a Quantum is not created when all of its
        outputs already exist; otherwise an exception is raised.
    """

    def __init__(self, taskFactory, registry, skipExisting=True):
        self.taskFactory = taskFactory
        self.registry = registry
        self.dimensions = registry.dimensions
        self.skipExisting = skipExisting

    def _loadTaskClass(self, taskDef):
        """Make sure task class is loaded.

        Load the task class and update the task name to make sure it is
        fully-qualified, but do not update the original ``taskDef`` in the
        Pipeline.

        Parameters
        ----------
        taskDef : `TaskDef`

        Returns
        -------
        `TaskDef` instance; may be the same as the parameter if the task
        class is already loaded.
        """
        if taskDef.taskClass is None:
            tClass, tName = self.taskFactory.loadTaskClass(taskDef.taskName)
            taskDef = copy.copy(taskDef)
            taskDef.taskClass = tClass
            taskDef.taskName = tName
        return taskDef

    def makeGraph(self, pipeline, originInfo, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        originInfo : `~lsst.daf.butler.DatasetOriginInfo`
            Object which provides names of the input/output collections.
        userQuery : `str`
            String which defines user-defined selection for registry; should
            be empty or `None` if there are no restrictions on data
            selection.

        Returns
        -------
        graph : `QuantumGraph`

        Raises
        ------
        UserExpressionError
            Raised when user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        # make sure all task classes are loaded
        taskList = [self._loadTaskClass(taskDef) for taskDef in pipeline]

        # collect inputs/outputs from each task
        taskDatasets = []
        for taskDef in taskList:
            taskClass = taskDef.taskClass
            inputs = {k: v.makeDatasetType(self.registry.dimensions)
                      for k, v in taskClass.getInputDatasetTypes(taskDef.config).items()}
            prerequisite = set(inputs[k] for k in taskClass.getPrerequisiteDatasetTypes(taskDef.config))
            taskIo = [inputs.values()]
            for attr in ("Output", "InitInput", "InitOutput"):
                getter = getattr(taskClass, f"get{attr}DatasetTypes")
                ioObject = getter(taskDef.config) or {}
                taskIo.append(set(dsTypeDescr.makeDatasetType(self.registry.dimensions)
                                  for dsTypeDescr in ioObject.values()))
            perDatasetTypeDimensions = DimensionSet(self.registry.dimensions,
                                                    taskClass.getPerDatasetTypeDimensions(taskDef.config))
            taskDatasets.append(_TaskDatasetTypes(taskDef, *taskIo, prerequisite=prerequisite,
                                                  perDatasetTypeDimensions=perDatasetTypeDimensions))

        perDatasetTypeDimensions = self._extractPerDatasetTypeDimensions(taskDatasets)

        # categorize dataset types for the full Pipeline
        required, optional, prerequisite, initInputs, initOutputs = self._makeFullIODatasetTypes(taskDatasets)

        # make a graph
        return self._makeGraph(taskDatasets, required, optional, prerequisite, initInputs, initOutputs,
                               originInfo, userQuery, perDatasetTypeDimensions=perDatasetTypeDimensions)

    def _extractPerDatasetTypeDimensions(self, taskDatasets):
        """Return the complete set of all per-DatasetType dimensions declared
        by any task.

        Per-DatasetType dimensions are those that need not have the same
        values for different Datasets within a Quantum.

        Parameters
        ----------
        taskDatasets : sequence of `_TaskDatasetTypes`
            Information for each task in the pipeline.

        Returns
        -------
        perDatasetTypeDimensions : `~lsst.daf.butler.DimensionSet`
            All per-DatasetType dimensions.

        Raises
        ------
        ValueError
            Raised if tasks disagree on whether a dimension is declared
            per-DatasetType.
        """
        # Empty dimension set, just used to construct more DimensionSets via
        # its union method.
        noDimensions = DimensionSet(self.registry.dimensions, ())
        # Construct the pipeline-wide perDatasetTypeDimensions set from the
        # union of all task-level perDatasetTypeDimensions.
        perDatasetTypeDimensions = noDimensions.union(
            *[taskDs.perDatasetTypeDimensions for taskDs in taskDatasets]
        )
        # Check that no tasks want any of these as common (i.e. not
        # per-DatasetType) dimensions.
        for taskDs in taskDatasets:
            allTaskDimensions = noDimensions.union(
                *[datasetType.dimensions for datasetType in chain(taskDs.inputs, taskDs.outputs)]
            )
            commonTaskDimensions = allTaskDimensions - taskDs.perDatasetTypeDimensions
            if not commonTaskDimensions.isdisjoint(perDatasetTypeDimensions):
                overlap = commonTaskDimensions.intersection(perDatasetTypeDimensions)
                raise ValueError(
                    f"Task {taskDs.taskDef.taskName} uses dimensions {overlap} without declaring them "
                    f"per-DatasetType, but they are declared per-DatasetType by another task."
                )
        return perDatasetTypeDimensions
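    # A minimal sketch of the consistency rule enforced above, using plain
    # frozensets instead of DimensionSet (illustrative only; the dimension
    # names are hypothetical):
    #
    #     pipelineWide = frozenset({"abstract_filter"})    # declared by task A
    #     taskBUses = frozenset({"visit", "abstract_filter"})
    #     taskBDeclares = frozenset()                      # task B declares none
    #     common = taskBUses - taskBDeclares
    #     assert not common.isdisjoint(pipelineWide)       # overlap -> ValueError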

 

    def _makeFullIODatasetTypes(self, taskDatasets):
        """Return the full set of input and output dataset types for all
        tasks.

        Parameters
        ----------
        taskDatasets : sequence of `_TaskDatasetTypes`
            Tasks with their inputs, outputs, initInputs and initOutputs.

        Returns
        -------
        required : `set` of `~lsst.daf.butler.DatasetType`
            Datasets that must exist in the repository in order to generate
            a QuantumGraph node that consumes them.
        optional : `set` of `~lsst.daf.butler.DatasetType`
            Datasets that will be produced by the graph, but may exist in the
            repository.  If ``self.skipExisting`` is `True` and all outputs
            of a particular node already exist, it will be skipped.
            Otherwise pre-existing datasets of these types will cause
            `OutputExistsError` to be raised.
        prerequisite : `set` of `~lsst.daf.butler.DatasetType`
            Datasets that must exist in the repository, but whose absence
            should cause `PrerequisiteMissingError` to be raised if they
            are needed by any graph node that would otherwise be created.
        initInputs : `set` of `~lsst.daf.butler.DatasetType`
            Datasets used as init method inputs by the pipeline.
        initOutputs : `set` of `~lsst.daf.butler.DatasetType`
            Datasets used as init method outputs by the pipeline.
        """
        # To build the initial dataset graph we have to collect info about
        # all datasets to be used by this pipeline.
        allDatasetTypes = {}
        required = set()
        optional = set()
        prerequisite = set()
        initInputs = set()
        initOutputs = set()
        for taskDs in taskDatasets:
            for ioType, ioSet in zip(("inputs", "outputs", "prerequisite", "initInputs", "initOutputs"),
                                     (required, optional, prerequisite, initInputs, initOutputs)):
                for dsType in getattr(taskDs, ioType):
                    ioSet.add(dsType.name)
                    allDatasetTypes[dsType.name] = dsType

        # Any dataset the pipeline produces can't be required or prerequisite.
        required -= optional
        prerequisite -= optional

        # remove initOutputs from initInputs
        initInputs -= initOutputs

        required = set(allDatasetTypes[name] for name in required)
        optional = set(allDatasetTypes[name] for name in optional)
        prerequisite = set(allDatasetTypes[name] for name in prerequisite)
        initInputs = set(allDatasetTypes[name] for name in initInputs)
        initOutputs = set(allDatasetTypes[name] for name in initOutputs)
        return required, optional, prerequisite, initInputs, initOutputs
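    # Worked example of the categorization above (hypothetical dataset type
    # names): if task A consumes "raw" and produces "calexp", and task B
    # consumes "calexp" and produces "coadd", the set arithmetic leaves
    # required = {"raw"} and optional = {"calexp", "coadd"}; "calexp" drops
    # out of required because the pipeline itself produces it.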

 

    def _makeGraph(self, taskDatasets, required, optional, prerequisite,
                   initInputs, initOutputs, originInfo, userQuery,
                   perDatasetTypeDimensions=()):
        """Make QuantumGraph instance.

        Parameters
        ----------
        taskDatasets : sequence of `_TaskDatasetTypes`
            Tasks with their inputs and outputs.
        required : `set` of `~lsst.daf.butler.DatasetType`
            Datasets that must exist in the repository in order to generate
            a QuantumGraph node that consumes them.
        optional : `set` of `~lsst.daf.butler.DatasetType`
            Datasets that will be produced by the graph, but may exist in
            the repository.  If ``self.skipExisting`` and all outputs of a
            particular node already exist, it will be skipped.  Otherwise
            pre-existing datasets of these types will cause
            `OutputExistsError` to be raised.
        prerequisite : `set` of `~lsst.daf.butler.DatasetType`
            Datasets that must exist in the repository, but whose absence
            should cause `PrerequisiteMissingError` to be raised if they
            are needed by any graph node that would otherwise be created.
        initInputs : `set` of `DatasetType`
            Datasets which should exist in the input repository, and will be
            used in task initialization.
        initOutputs : `set` of `DatasetType`
            Datasets which will be created in task initialization.
        originInfo : `DatasetOriginInfo`
            Object which provides names of the input/output collections.
        userQuery : `str`
            String which defines user-defined selection for registry; should
            be empty or `None` if there are no restrictions on data
            selection.
        perDatasetTypeDimensions : iterable of `Dimension` or `str`
            Dimensions (or names thereof) that may have different values for
            different dataset types within the same quantum.

        Returns
        -------
        `QuantumGraph` instance.
        """
        rows = self.registry.selectMultipleDatasetTypes(
            originInfo, userQuery,
            required=required, optional=optional, prerequisite=prerequisite,
            perDatasetTypeDimensions=perDatasetTypeDimensions
        )

        # Store the result locally for the multi-pass algorithm below.
        # TODO: change it to single pass
        dimensionVerse = []
        try:
            for row in rows:
                _LOG.debug("row: %s", row)
                dimensionVerse.append(row)
        except LookupError as err:
            raise PrerequisiteMissingError(str(err)) from err

        # Next step is to group by task quantum dimensions.
        qgraph = QuantumGraph()
        qgraph._inputDatasetTypes = (required | prerequisite)
        qgraph._outputDatasetTypes = optional
        for dsType in initInputs:
            for collection in originInfo.getInputCollections(dsType.name):
                result = self.registry.find(collection, dsType)
                if result is not None:
                    qgraph.initInputs.append(result)
                    break
            else:
                raise GraphBuilderError(f"Could not find initInput {dsType.name} in any input"
                                        " collection")
        for dsType in initOutputs:
            qgraph.initOutputs.append(DatasetRef(dsType, {}))

        for taskDss in taskDatasets:
            taskQuantaInputs = {}   # key is the quantum dataId (as tuple)
            taskQuantaOutputs = {}  # key is the quantum dataId (as tuple)
            qlinks = []
            for dimensionName in taskDss.taskDef.config.quantum.dimensions:
                dimension = self.dimensions[dimensionName]
                qlinks += dimension.links()
            _LOG.debug("task %s qdimensions: %s", taskDss.taskDef.label, qlinks)

            # Some rows will be non-unique for a subset of dimensions; create
            # a temporary structure to remove duplicates.
            for row in dimensionVerse:
                qkey = tuple((col, row.dataId[col]) for col in qlinks)
                _LOG.debug("qkey: %s", qkey)

 

                def _datasetRefKey(datasetRef):
                    return tuple(sorted(datasetRef.dataId.items()))

                qinputs = taskQuantaInputs.setdefault(qkey, {})
                for dsType in taskDss.inputs:
                    datasetRefs = qinputs.setdefault(dsType, {})
                    datasetRef = row.datasetRefs[dsType]
                    datasetRefs[_datasetRefKey(datasetRef)] = datasetRef
                    _LOG.debug("add input datasetRef: %s %s", dsType.name, datasetRef)

                qoutputs = taskQuantaOutputs.setdefault(qkey, {})
                for dsType in taskDss.outputs:
                    datasetRefs = qoutputs.setdefault(dsType, {})
                    datasetRef = row.datasetRefs[dsType]
                    datasetRefs[_datasetRefKey(datasetRef)] = datasetRef
                    _LOG.debug("add output datasetRef: %s %s", dsType.name, datasetRef)

            # all nodes for this task
            quanta = []
            for qkey in taskQuantaInputs:
                # taskQuantaInputs and taskQuantaOutputs have the same keys
                _LOG.debug("make quantum for qkey: %s", qkey)
                quantum = Quantum(run=None, task=None)

                # add all outputs, but check first that outputs don't exist
                outputs = list(chain.from_iterable(datasetRefs.values()
                                                   for datasetRefs in taskQuantaOutputs[qkey].values()))
                for ref in outputs:
                    _LOG.debug("add output: %s", ref)
                if self.skipExisting and all(ref.id is not None for ref in outputs):
                    _LOG.debug("all output datasetRefs already exist, skip quantum")
                    continue
                if any(ref.id is not None for ref in outputs):
                    # some outputs exist, can't override them
                    raise OutputExistsError(taskDss.taskDef.taskName, outputs)

                for ref in outputs:
                    quantum.addOutput(ref)

                # add all inputs
                for datasetRefs in taskQuantaInputs[qkey].values():
                    for ref in datasetRefs.values():
                        quantum.addPredictedInput(ref)
                        _LOG.debug("add input: %s", ref)

                quanta.append(quantum)

            qgraph.append(QuantumGraphTaskNodes(taskDss.taskDef, quanta))

        return qgraph
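
# A minimal usage sketch (illustrative only): `taskFactory`, `registry`,
# `pipeline`, and `originInfo` are assumed to be constructed elsewhere,
# e.g. by a command-line activator, and the query string is hypothetical.
#
#     builder = GraphBuilder(taskFactory, registry, skipExisting=True)
#     qgraph = builder.makeGraph(pipeline, originInfo,
#                                userQuery="visit = 903334")
#     for taskNodes in qgraph:
#         print(taskNodes.taskDef.taskName, len(taskNodes.quanta))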