# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

__all__ = ("SqlPreFlight",)

import itertools
import logging
from sqlalchemy.sql import select, and_, text

from lsst.sphgeom import Region
from lsst.sphgeom.relationship import DISJOINT


_LOG = logging.getLogger(__name__)


def _scanDataUnits(dataUnits):
    """Recursively scan units and their optional dependencies, yielding
    their names."""
    for dataUnit in dataUnits:
        yield dataUnit.name
        yield from _scanDataUnits(dataUnit.optionalDependencies)
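
# For illustration (unit names hypothetical): if the Visit unit optionally
# depends on the Exposure unit, _scanDataUnits([visitUnit]) would yield
# "Visit" followed by "Exposure" and the names of any of Exposure's own
# optional dependencies.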
 

def _filterSummarizes(dataUnitJoins):
    """Filter out DataUnitJoins that summarize other DataUnitJoins.

    Parameters
    ----------
    dataUnitJoins : iterable of `DataUnitJoin`

    Returns
    -------
    Iterator for DataUnitJoins which do not summarize any of the
    DataUnitJoins in the input set.
    """
    dataUnitJoins = list(dataUnitJoins)
    dataUnitJoinNames = set(join.name for join in dataUnitJoins)
    for dataUnitJoin in dataUnitJoins:
        summarizes = set(dataUnitJoin.summarizes or [])
        # If it summarizes some other joins and all of those joins are in
        # the set of joins then we do not need it.
        if summarizes and summarizes.issubset(dataUnitJoinNames):
            continue
        yield dataUnitJoin
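
# A minimal sketch of the behaviour, with hypothetical join names: if a
# join "VisitSkyPixJoin" summarizes "SensorSkyPixJoin" and both are in the
# input set, only the more specific join is kept:
#
#     joins = [visitSkyPixJoin, sensorSkyPixJoin]
#     list(_filterSummarizes(joins))   # -> [sensorSkyPixJoin]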
 

def _filterRegions(rowIter, firstRegionIndex):
    """Filter out result rows that have non-overlapping regions.

    The result set generated by the query in the selectDataUnits() method
    can include a set of regions in each row (encoded as bytes). Due to
    pixel-based matching some regions may not overlap; this generator
    method filters out rows that have disjoint regions. If a result row
    contains more than two regions (which should not happen with our
    current schema) then the row is filtered out if any two of its regions
    are disjoint.

    Parameters
    ----------
    rowIter : iterable
        Iterator for rows returned by the query on registry.
    firstRegionIndex : `int` or ``None``
        If not ``None`` then this is the starting position of the regions
        in the row; all columns starting at this position contain region
        data. All regions are encoded as bytes.
    """
    total = 0
    if firstRegionIndex is not None:
        count = 0
        for row in rowIter:
            total += 1
            regions = [Region.decode(region) for region in row[firstRegionIndex:]]
            for reg1, reg2 in itertools.combinations(regions, 2):
                if reg1.relate(reg2) == DISJOINT:
                    break
            else:
                count += 1
                yield tuple(row[:firstRegionIndex])
        _LOG.debug("Total %d rows in result set, %d after region filtering",
                   total, count)
    else:
        for row in rowIter:
            total += 1
            yield tuple(row)
        _LOG.debug("Total %d rows in result set, no region filtering", total)
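
# A minimal sketch of the region filtering, assuming r1 and r2 are
# lsst.sphgeom Region instances whose encoded bytes occupy the last two
# columns of each row:
#
#     rows = [(903334, 22, r1.encode(), r2.encode())]
#     list(_filterRegions(iter(rows), firstRegionIndex=2))
#     # -> [(903334, 22)] if r1 and r2 overlap, [] if they are DISJOINT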
 

class SqlPreFlight:
    """Class implementing the part of the preflight solver which extracts
    unit data from the registry.

    This is an implementation detail, only to be used by the SqlRegistry
    class and not supposed to be used anywhere else.

    Parameters
    ----------
    schema : `Schema`
        Schema instance.
    connection : `sqlalchemy.Connection`
        Connection to use for database access.
    """

    def __init__(self, schema, connection):
        self._schema = schema
        self._connection = connection

    def selectDataUnits(self, collections, expr, neededDatasetTypes, futureDatasetTypes):
        """Evaluate a filter expression and lists of
        `DatasetTypes <DatasetType>` and return a set of data unit values.

        The returned set consists of combinations of units participating
        in the data transformation from ``neededDatasetTypes`` to
        ``futureDatasetTypes``, restricted by existing data and the filter
        expression.

        Parameters
        ----------
        collections : `list` of `str`
            An ordered `list` of collections indicating the Collections to
            search for Datasets.
        expr : `str`
            An expression that limits the `DataUnits <DataUnit>` and
            (indirectly) the Datasets returned.
        neededDatasetTypes : `list` of `DatasetType`
            The `list` of `DatasetTypes <DatasetType>` whose DataUnits will
            be included in the returned column set. Output is limited by
            the Datasets of these DatasetTypes which already exist in the
            registry.
        futureDatasetTypes : `list` of `DatasetType`
            The `list` of `DatasetTypes <DatasetType>` whose DataUnits will
            be included in the returned column set. It is expected that
            Datasets for these DatasetTypes do not exist in the registry,
            but presently this is not checked.

        Returns
        -------
        header : `tuple` of `tuple`
            The length of the tuple equals the number of columns in the
            returned result set. Each item is a tuple with two elements -
            the DataUnit name (e.g. "Visit") and the unit value name
            (e.g. "visit").
        rows : iterable of `tuple`
            The result set; this can be a single-pass iterator. Each tuple
            contains unit values corresponding to the units in the header.
        """

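        # For example (illustrative; actual names depend on the schema), a
        # query involving Camera and Visit units could return
        #     header == (("Camera", "camera"), ("Visit", "visit"))
        # with each result row holding a (camera, visit) value pair.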

        # for now only a single collection is supported
        if len(collections) != 1:
            raise ValueError("Only single collection is supported by makeDataGraph()")
        collection = collections[0]

        # Collect unit names used in both input and output dataset types
        allUnitNames = set(itertools.chain.from_iterable(dsType.dataUnits for dsType in neededDatasetTypes))
        allUnitNames.update(itertools.chain.from_iterable(dsType.dataUnits for dsType in futureDatasetTypes))
        _LOG.debug("allUnitNames: %s", allUnitNames)

        # Build the select column list
        selectColumns = []
        header = []   # tuple (UnitName, link_name) for each returned column
        for unitName in allUnitNames:
            dataUnit = self._schema.dataUnits[unitName]
            if dataUnit.table is not None:
                # take link column names, usually there is one
                for link in dataUnit.link:
                    header.append((dataUnit.name, link))
                    selectColumns.append(dataUnit.table.c[link])
        _LOG.debug("selectColumns: %s", selectColumns)
        _LOG.debug("header: %s", header)

        # Extend the units set with the "optional" superset from the schema,
        # so that joins work correctly. This may bring more tables into the
        # query than are really needed, a potential optimization.
        allUnitNames = set(_scanDataUnits(self._schema.dataUnits[unitName] for unitName in allUnitNames))

        # All DataUnit instances in the subset that we need
        allDataUnits = {unitName: self._schema.dataUnits[unitName] for unitName in allUnitNames}

        # joins for all unit tables
        where = []
        for dataUnit in allDataUnits.values():
            if dataUnit.table is None:
                continue
            _LOG.debug("add dataUnit: %s", dataUnit.name)

            # join with the tables that we depend upon
            for otherUnit in dataUnit.dependencies:
                _LOG.debug(" join with unit: %s", otherUnit.name)
                for name, col in otherUnit.primaryKeyColumns.items():
                    _LOG.debug(" joining on column: %s", name)
                    where.append(dataUnit.table.c[name] == col)

        # joins between skymap and camera units
        dataUnitJoins = [dataUnitJoin for dataUnitJoin in self._schema.dataUnits.joins.values()
                         if dataUnitJoin.lhs.issubset(allUnitNames) and
                         dataUnitJoin.rhs.issubset(allUnitNames)]
        _LOG.debug("all dataUnitJoins: %s", [join.name for join in dataUnitJoins])

        # only use the most specific joins
        dataUnitJoins = list(_filterSummarizes(dataUnitJoins))
        _LOG.debug("filtered dataUnitJoins: %s", [join.name for join in dataUnitJoins])

        joinedRegionTables = set()
        firstRegionIndex = None
        for dataUnitJoin in dataUnitJoins:
            # Some `DataUnitJoin`s have an associated region (e.g. they are
            # spatial); in that case they shouldn't be joined separately in
            # the region lookup.
            if dataUnitJoin.spatial:
                continue

            # TODO: we do not know yet how to handle MultiCameraExposureJoin,
            # skip it for now
            if dataUnitJoin.lhs == dataUnitJoin.rhs:
                continue

            # Look at each side of the DataUnitJoin and join it with the
            # corresponding DataUnit tables, including making all necessary
            # joins for the special multi-DataUnit region table(s).
            for connection in (dataUnitJoin.lhs, dataUnitJoin.rhs):
                regionHolder = self._schema.dataUnits.getRegionHolder(*connection)
                if len(connection) > 1:
                    # if one of the joins is with Visit/Sensor then also bring
                    # the VisitSensorRegion table in and join it with the units
                    if regionHolder.name in joinedRegionTables:
                        _LOG.debug("region table already joined with units: %s", regionHolder.name)
                    else:
                        _LOG.debug("joining region table with units: %s", regionHolder.name)
                        joinedRegionTables.add(regionHolder.name)

                        for dataUnitName in connection:
                            dataUnit = self._schema.dataUnits[dataUnitName]
                            _LOG.debug(" joining region table with %s", dataUnitName)
                            for name, col in dataUnit.primaryKeyColumns.items():
                                _LOG.debug(" joining on column: %s", name)
                                where.append(regionHolder.table.c[name] == col)

                # now join the region table with the join table using the
                # primary keys of all units
                _LOG.debug("join %s with %s", dataUnitJoin.name, connection)
                for colName in self._schema.dataUnits.getPrimaryKeyNames(connection):
                    _LOG.debug(" joining on column: %s", colName)
                    where.append(dataUnitJoin.table.c[colName] == regionHolder.table.c[colName])

                # We also have to include the regions from each side of the
                # join in the result set so that we can filter out
                # non-overlapping regions.
                firstRegionIndex = len(header)
                selectColumns.append(regionHolder.regionColumn)

        _LOG.debug("units where: %s", [str(x) for x in where])

        # join with input datasets to restrict the query to existing inputs
        dsTable = self._schema._metadata.tables["Dataset"]
        dsCollTable = self._schema._metadata.tables["DatasetCollection"]
        for dsType in neededDatasetTypes:
            _LOG.debug("joining dataset: %s", dsType.name)
            dsAlias = dsTable.alias("ds" + dsType.name)
            dsCollAlias = dsCollTable.alias("dsColl" + dsType.name)

            for unitName in dsType.dataUnits:
                dataUnit = allDataUnits[unitName]
                for link in dataUnit.link:
                    _LOG.debug("joining on link: %s", link)
                    where.append(dsAlias.c[link] == dataUnit.table.c[link])

            where += [dsAlias.c["dataset_id"] == dsCollAlias.c["dataset_id"],
                      dsAlias.c["dataset_type_name"] == dsType.name,
                      dsCollAlias.c["collection"] == collection]
        _LOG.debug("datasets where: %s", [str(x) for x in where])

        # build the full query
        q = select(selectColumns)
        if expr:
            # TODO: potentially transform the query from a user-friendly
            # expression
            where += [text(expr)]
        if where:
            where = and_(*where)
            _LOG.debug("full where: %s", where)
            q = q.where(where)
        _LOG.debug("full query: %s", q)

        # execute and return the header and a result iterator
        rows = self._connection.execute(q).fetchall()
        return tuple(header), _filterRegions(rows, firstRegionIndex)
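
# A hedged usage sketch: SqlPreFlight is meant to be driven by SqlRegistry,
# roughly along the following lines, where `registry` holds a Schema and an
# open SQLAlchemy connection and the dataset types are illustrative:
#
#     preFlight = SqlPreFlight(registry._schema, registry._connection)
#     header, rows = preFlight.selectDataUnits(["raw"], "visit > 100",
#                                              [rawType], [calexpType])
#     for row in rows:
#         dataId = dict(zip((link for _, link in header), row))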