Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

# This file is part of daf_butler. 

# 

# Developed for the LSST Data Management System. 

# This product includes software developed by the LSST Project 

# (http://www.lsst.org). 

# See the COPYRIGHT file at the top-level directory of this distribution 

# for details of code ownership. 

# 

# This program is free software: you can redistribute it and/or modify 

# it under the terms of the GNU General Public License as published by 

# the Free Software Foundation, either version 3 of the License, or 

# (at your option) any later version. 

# 

# This program is distributed in the hope that it will be useful, 

# but WITHOUT ANY WARRANTY; without even the implied warranty of 

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

# GNU General Public License for more details. 

# 

# You should have received a copy of the GNU General Public License 

# along with this program. If not, see <http://www.gnu.org/licenses/>. 

 

# Public API of this module: only the builder class is exported.
__all__ = ("SingleDatasetQueryBuilder",)


import logging

from sqlalchemy.sql import select, and_, functions, case

from ..core import DimensionJoin
from .queryBuilder import QueryBuilder

# Module-level logger; not used in the code visible here, but kept for
# debugging hooks (convention used across daf_butler modules).
_LOG = logging.getLogger(__name__)

 

 

class SingleDatasetQueryBuilder(QueryBuilder):
    """Specialization of `QueryBuilder` that includes a single join
    to the Dataset table corresponding to a single `DatasetType`.

    Most users should call `fromCollections` to construct an instance of this
    class rather than invoking the constructor directly.

    Parameters
    ----------
    registry : `SqlRegistry`
        Registry instance the query is being run against.
    datasetType : `DatasetType`
        `DatasetType` of the datasets this query searches for.
    selectableForDataset : `sqlalchemy.sql.expression.FromClause`
        SQLAlchemy object representing the Dataset table or a subquery
        equivalent.
    fromClause : `sqlalchemy.sql.expression.FromClause`, optional
        Initial FROM clause for the query.
    whereClause : SQLAlchemy boolean expression, optional
        Expression to use as the initial WHERE clause.
    addResultColumns : `bool`
        If `True` (default), add result columns to ``self.resultColumns``
        for the dataset ID and dimension links used to identify this
        `DatasetType`.
    """

    def __init__(self, registry, *, datasetType, selectableForDataset, fromClause=None, whereClause=None,
                 addResultColumns=True):
        super().__init__(registry, fromClause=fromClause, whereClause=whereClause)
        self._datasetType = datasetType
        self._selectableForDataset = selectableForDataset
        if addResultColumns:
            # Select the dataset_id plus one column per dimension link so the
            # result rows are sufficient to reconstruct a DatasetRef.
            self.selectDatasetId()
            for link in datasetType.dimensions.links():
                self.resultColumns.addDimensionLink(selectableForDataset, link)

    @classmethod
    def fromSingleCollection(cls, registry, datasetType, collection, addResultColumns=True):
        """Construct a builder that searches a single collection for datasets
        of a single dataset type.

        Parameters
        ----------
        registry : `SqlRegistry`
            Registry instance the query is being run against.
        datasetType : `DatasetType`
            `DatasetType` of the datasets this query searches for.
        collection : `str`
            Name of the collection to search in.
        addResultColumns : `bool`
            If `True` (default), add result columns to ``self.resultColumns``
            for the dataset ID and dimension links used to identify this
            `DatasetType`.

        Returns
        -------
        builder : `SingleDatasetQueryBuilder`
            New query builder instance initialized with a
            `~QueryBuilder.fromClause` that either directly includes the
            dataset table or includes a subquery equivalent.

        Notes
        -----
        If there is only one collection, then there is a guarantee that
        data IDs are all unique (by data ID we mean the combination of all link
        values relevant for this dataset); in that case the dataset query can
        be written as:

            SELECT
                Dataset.dataset_id AS dataset_id,
                Dataset.link1 AS link1,
                ...
                Dataset.linkN AS linkN
            FROM Dataset JOIN DatasetCollection
                ON Dataset.dataset_id = DatasetCollection.dataset_id
            WHERE Dataset.dataset_type_name = :dsType_name
                AND DatasetCollection.collection = :collection_name
        """
        datasetTable = registry._schema.tables["Dataset"]
        datasetCollectionTable = registry._schema.tables["DatasetCollection"]
        # Plain inner join of Dataset to its collection-membership table;
        # the single-collection constraint goes in the WHERE clause below.
        fromClause = datasetTable.join(
            datasetCollectionTable,
            datasetTable.columns.dataset_id == datasetCollectionTable.columns.dataset_id
        )
        whereClause = and_(datasetTable.columns.dataset_type_name == datasetType.name,
                           datasetCollectionTable.columns.collection == collection)
        return cls(registry, fromClause=fromClause, whereClause=whereClause, datasetType=datasetType,
                   selectableForDataset=datasetTable, addResultColumns=addResultColumns)

    @classmethod
    def fromCollections(cls, registry, datasetType, collections, addResultColumns=True):
        """Construct a builder that searches multiple collections for
        datasets of a single dataset type.

        Parameters
        ----------
        registry : `SqlRegistry`
            Registry instance the query is being run against.
        datasetType : `DatasetType`
            `DatasetType` of the datasets this query searches for.
        collections : `list` of `str`
            List of collections to search, ordered from highest-priority to
            lowest.
        addResultColumns : `bool`
            If `True` (default), add result columns to ``self.resultColumns``
            for the dataset ID and dimension links used to identify this
            `DatasetType`.

        Returns
        -------
        builder : `SingleDatasetQueryBuilder`
            New query builder instance initialized with a
            `~QueryBuilder.fromClause` that either directly includes the
            dataset table or includes a subquery equivalent.

        Notes
        -----
        If ``len(collections)==1``, this method simply calls
        `fromSingleCollection`.

        If there are multiple collections, then there can be multiple matching
        Datasets for the same DataId. In that case we need only one Dataset
        record, which comes from earliest collection (in the user-provided
        order). Here things become complicated; we have to:
        - replace collection names with their order in input list
        - select all combinations of rows from Dataset and DatasetCollection
          which match collection names and dataset type name
        - from those only select rows with lowest collection position if
          there are multiple collections for the same DataId

        Replacing collection names with positions is easy:

            SELECT dataset_id,
                CASE collection
                    WHEN 'collection1' THEN 0
                    WHEN 'collection2' THEN 1
                    ...
                END AS collorder
            FROM DatasetCollection

        Combined query will look like (CASE ... END is as above):

            SELECT Dataset.dataset_id AS dataset_id,
                CASE DatasetCollection.collection ... END AS collorder,
                Dataset.link1,
                ...
                Dataset.linkN
            FROM Dataset JOIN DatasetCollection
                ON Dataset.dataset_id = DatasetCollection.dataset_id
            WHERE Dataset.dataset_type_name = <dsType.name>
                AND DatasetCollection.collection IN (<collections>)

        Filtering is complicated; it would be simpler to use Common Table
        Expressions (WITH clause) but not all databases support CTEs, so we
        will have to do with the repeating sub-queries. We use GROUP BY for
        the data ID (link columns) and MIN(collorder) to find ``collorder``
        for a particular DataId, then join it with previous combined selection:

            SELECT
                DS.dataset_id AS dataset_id,
                DS.link1 AS link1,
                ...
                DS.linkN AS linkN
            FROM (
                SELECT Dataset.dataset_id AS dataset_id,
                    CASE ... END AS collorder,
                    Dataset.link1,
                    ...
                    Dataset.linkN
                FROM Dataset JOIN DatasetCollection
                    ON Dataset.dataset_id = DatasetCollection.dataset_id
                WHERE Dataset.dataset_type_name = <dsType.name>
                    AND DatasetCollection.collection IN (<collections>)
                ) DS
            INNER JOIN (
                SELECT
                    MIN(CASE ... END) AS collorder,
                    Dataset.link1,
                    ...
                    Dataset.linkN
                FROM Dataset JOIN DatasetCollection
                    ON Dataset.dataset_id = DatasetCollection.dataset_id
                WHERE Dataset.dataset_type_name = <dsType.name>
                    AND DatasetCollection.collection IN (<collections>)
                GROUP BY (
                    Dataset.link1,
                    ...
                    Dataset.linkN
                    )
                ) DSG
            ON (DS.collorder = DSG.collorder
                AND
                DS.link1 = DSG.link1
                AND
                ...
                AND
                DS.linkN = DSG.linkN)
        """
        if len(collections) == 1:
            return cls.fromSingleCollection(registry, datasetType, collections[0],
                                            addResultColumns=addResultColumns)

        # helper method
        def _columns(selectable, names):
            """Return list of columns for given column names"""
            return [selectable.columns[name].label(name) for name in names]

        datasetTable = registry._schema.tables["Dataset"]
        datasetCollectionTable = registry._schema.tables["DatasetCollection"]

        # full set of link names for this DatasetType
        links = list(datasetType.dimensions.links())

        # Starting point for both subqueries below: a join of Dataset to
        # DatasetCollection
        subJoin = datasetTable.join(
            datasetCollectionTable,
            datasetTable.columns.dataset_id == datasetCollectionTable.columns.dataset_id
        )
        subWhere = and_(datasetTable.columns.dataset_type_name == datasetType.name,
                        datasetCollectionTable.columns.collection.in_(collections))

        # CASE clause that transforms collection name to position in the given
        # list of collections
        collorder = case([
            (datasetCollectionTable.columns.collection == coll, pos) for pos, coll in enumerate(collections)
        ])

        # first GROUP BY sub-query, find minimum `collorder` for each DataId
        columns = [functions.min(collorder).label("collorder")] + _columns(datasetTable, links)
        groupSubq = select(columns).select_from(subJoin).where(subWhere)
        groupSubq = groupSubq.group_by(*links)
        # alias names are suffixed with the dataset type name so that several
        # of these queries can be combined without colliding
        groupSubq = groupSubq.alias("sub1" + datasetType.name)

        # next combined sub-query
        columns = [collorder.label("collorder")] + _columns(datasetTable, ["dataset_id"] + links)
        combined = select(columns).select_from(subJoin).where(subWhere)
        combined = combined.alias("sub2" + datasetType.name)

        # now join these two on matching collorder and all link columns
        joinsOn = [groupSubq.columns.collorder == combined.columns.collorder] + \
            [groupSubq.columns[colName] == combined.columns[colName] for colName in links]

        # `combined` carries dataset_id and the link columns, so it serves as
        # the selectable standing in for the Dataset table itself.
        return cls(registry, fromClause=combined.join(groupSubq, and_(*joinsOn)),
                   datasetType=datasetType, selectableForDataset=combined, addResultColumns=addResultColumns)

    @property
    def datasetType(self):
        """The dataset type this query searches for (`DatasetType`).
        """
        return self._datasetType

    def relateDimensions(self, otherDimensions, addResultColumns=True):
        """Add the dimension tables/views necessary to map the dimensions
        of this query's `DatasetType` to another set of dimensions.

        Parameters
        ----------
        otherDimensions : `DimensionGraph` or `DimensionSet`
            The dimensions we need to relate the dataset type to. One or more
            `DimensionJoin` tables/views will be added to the query for each
            `Dimension` in ``self.datasetType.dimensions`` that is not
            in ``otherDimensions``.
        addResultColumns : `bool`
            If `True` (default), add result columns to ``self.resultColumns``
            for the dataset ID and dimension links used to identify this
            `DatasetType`.

        Returns
        -------
        newLinks : `set` of `str`
            The names of additional dimension link columns provided by the
            subquery via the added joins. This is a subset of
            ``otherDimensions.link()`` and disjoint from
            ``self.datasetType.dimensions.links()``.

        Raises
        ------
        ValueError
            Raised if no suitable `DimensionJoin` exists for one of the
            missing links.

        Notes
        -----
        This method can currently only handle a single level of indirection -
        for any "missing" dimension (one that is in
        ``self.datasetType.dimensions`` but not in ``otherDimensions``), there
        must be a single `DimensionJoin` that relates that dimension to one or
        more dimensions in ``otherDimensions``.
        """
        allDimensions = self.datasetType.dimensions.union(otherDimensions)
        missingDimensions = self.datasetType.dimensions.toSet().difference(otherDimensions)
        newLinks = set()
        for missingLink in missingDimensions.links():
            related = False
            for element in self.registry.dimensions.withLink(missingLink):
                # Only consider join elements that must be materialized
                # (not asNeeded) and whose links are all available here.
                if (isinstance(element, DimensionJoin) and not element.asNeeded and
                        element.links().issubset(allDimensions.links())):
                    self.joinDimensionElement(element)
                    newLinks.update(element.links())
                    related = True
            if not related:
                raise ValueError(f"No join found relating link {missingLink} to {otherDimensions.links()}")
        # Report only links that are genuinely new, i.e. not already provided
        # by the dataset's own dimensions.
        newLinks -= self.datasetType.dimensions.links()
        if addResultColumns:
            for link in newLinks:
                self.selectDimensionLink(link)
        return newLinks

    def selectDatasetId(self):
        """Add the ``dataset_id`` column to the SELECT clause of the query.
        """
        self.resultColumns.addDatasetId(self._selectableForDataset, self.datasetType)

    def findSelectableForLink(self, link):
        # Docstring inherited from QueryBuilder.findSelectableForLink
        result = super().findSelectableForLink(link)
        if result is None and link in self.datasetType.dimensions.links():
            result = self._selectableForDataset
        return result

    def findSelectableByName(self, name):
        # Docstring inherited from QueryBuilder.findSelectableByName
        result = super().findSelectableByName(name)
        if result is None and (name == self.datasetType.name or name == "Dataset"):
            result = self._selectableForDataset
        return result

    def convertResultRow(self, managed, *, expandDataId=True):
        """Convert a result row for this query to a `DatasetRef`.

        Parameters
        ----------
        managed : `ResultsColumnsManager.ManagedRow`
            Intermediate result row object to convert.
        expandDataId : `bool`
            If `True` (default), query the registry again to fully populate
            the `DataId` associated with the returned `DatasetRef`.

        Returns
        -------
        ref : `DatasetRef`
            Reference to a dataset identified by the query.
        """
        return managed.makeDatasetRef(self.datasetType, expandDataId=expandDataId)