Coverage for python/lsst/daf/butler/column_spec.py: 81%

102 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-04-05 10:00 +0000

# This file is part of butler4.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively.  If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

27 

28from __future__ import annotations 

29 

__all__ = (
    "ColumnSpec",
    "IntColumnSpec",
    "StringColumnSpec",
    "HashColumnSpec",
    "FloatColumnSpec",
    "BoolColumnSpec",
    "UUIDColumnSpec",
    "RegionColumnSpec",
    "TimespanColumnSpec",
    # DateTimeColumnSpec is a member of the public ColumnSpec union, so it is
    # exported alongside the other concrete spec classes.
    "DateTimeColumnSpec",
    "ColumnType",
    "COLLECTION_NAME_MAX_LENGTH",
)

43 

44import textwrap 

45import uuid 

46from abc import ABC, abstractmethod 

47from typing import Annotated, Any, ClassVar, Literal, TypeAlias, Union, final 

48 

49import astropy.time 

50import pyarrow as pa 

51import pydantic 

52from lsst.sphgeom import Region 

53 

54from . import arrow_utils, ddl 

55from ._timespan import Timespan 

56 

# The set of logical column types; these strings double as the ``type``
# discriminator values on the concrete spec classes below and as the keys
# used to look up SQL types in ``ddl.VALID_CONFIG_COLUMN_TYPES``.
ColumnType: TypeAlias = Literal[
    "int", "string", "hash", "float", "datetime", "bool", "uuid", "timespan", "region"
]

60 

61 

COLLECTION_NAME_MAX_LENGTH = 64
# TODO: DM-42541 would be a good opportunity to move this constant to a
# better home; this file is the least-bad home I can think of for now.  Note
# that actually changing the value is a (minor) schema change.

66 

67 

class _BaseColumnSpec(pydantic.BaseModel, ABC):
    """Base class for descriptions of table columns."""

    name: str = pydantic.Field(description="Name of the column.")

    doc: str = pydantic.Field(default="", description="Documentation for the column.")

    type: ColumnType

    nullable: bool = pydantic.Field(
        default=True,
        description="Whether the column may be ``NULL``.",
    )

    def to_sql_spec(self, **kwargs: Any) -> ddl.FieldSpec:
        """Convert this specification to a SQL-specific one.

        Parameters
        ----------
        **kwargs
            Forwarded to `ddl.FieldSpec`.

        Returns
        -------
        sql_spec : `ddl.FieldSpec`
            A SQL-specific version of this specification.
        """
        # The logical column type maps directly onto a SQL type via the
        # shared configuration table in the ddl module.
        sql_dtype = ddl.VALID_CONFIG_COLUMN_TYPES[self.type]
        return ddl.FieldSpec(name=self.name, dtype=sql_dtype, **kwargs)

    @abstractmethod
    def to_arrow(self) -> arrow_utils.ToArrow:
        """Return an object that converts values of this column to a column in
        an Arrow table.

        Returns
        -------
        converter : `arrow_utils.ToArrow`
            A converter object with schema information in Arrow form.
        """
        raise NotImplementedError()

    def display(self, level: int = 0, tab: str = " ") -> list[str]:
        """Return a human-reader-focused string description of this column as
        a list of lines.

        Parameters
        ----------
        level : `int`
            Number of indentation tabs for the first line.
        tab : `str`
            Characters to duplicate ``level`` times to form the actual indent.

        Returns
        -------
        lines : `list` [ `str` ]
            Display lines.
        """
        header = f"{tab * level}{self.name}: {self.type}"
        if not self.doc:
            return [header]
        # Documentation text is wrapped and indented one level deeper than
        # the "name: type" header line.
        doc_indent = tab * (level + 1)
        wrapped = textwrap.wrap(self.doc, initial_indent=doc_indent, subsequent_indent=doc_indent)
        return [header, *wrapped]

    def __str__(self) -> str:
        display_lines = self.display()
        return "\n".join(display_lines)

139 

140 

@final
class IntColumnSpec(_BaseColumnSpec):
    """Description of an integer column."""

    pytype: ClassVar[type] = int

    type: Literal["int"] = "int"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        # NOTE(review): integers are stored as *unsigned* 64-bit Arrow values
        # (pa.uint64()); confirm negative values never flow through here.
        arrow_type = pa.uint64()
        return arrow_utils.ToArrow.for_primitive(self.name, arrow_type, nullable=self.nullable)

152 

153 

@final
class StringColumnSpec(_BaseColumnSpec):
    """Description of a string column."""

    pytype: ClassVar[type] = str

    type: Literal["string"] = "string"

    length: int
    """Maximum length of strings."""

    def to_sql_spec(self, **kwargs: Any) -> ddl.FieldSpec:
        # Docstring inherited.
        # SQL string columns need the maximum length up front; the Arrow
        # conversion below does not.
        return super().to_sql_spec(length=self.length, **kwargs)

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        arrow_type = pa.string()
        return arrow_utils.ToArrow.for_primitive(self.name, arrow_type, nullable=self.nullable)

172 

173 

@final
class HashColumnSpec(_BaseColumnSpec):
    """Description of a hash digest."""

    pytype: ClassVar[type] = bytes

    type: Literal["hash"] = "hash"

    nbytes: int
    """Number of bytes for the hash."""

    def to_sql_spec(self, **kwargs: Any) -> ddl.FieldSpec:
        # Docstring inherited.
        return super().to_sql_spec(nbytes=self.nbytes, **kwargs)

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        # Arrow's sized binary type is a fixed size rather than a maximum as
        # in SQL, so a variable-size binary column is used instead.
        arrow_type = pa.binary()
        return arrow_utils.ToArrow.for_primitive(self.name, arrow_type, nullable=self.nullable)

198 

199 

@final
class FloatColumnSpec(_BaseColumnSpec):
    """Description of a float column."""

    pytype: ClassVar[type] = float

    type: Literal["float"] = "float"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        # ``nullable`` is declared as a non-optional bool with a default, so
        # it can never be None here; the stale defensive assert has been
        # removed for consistency with IntColumnSpec and BoolColumnSpec.
        return arrow_utils.ToArrow.for_primitive(self.name, pa.float64(), nullable=self.nullable)

212 

213 

@final
class BoolColumnSpec(_BaseColumnSpec):
    """Description of a bool column."""

    pytype: ClassVar[type] = bool

    type: Literal["bool"] = "bool"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        arrow_type = pa.bool_()
        return arrow_utils.ToArrow.for_primitive(self.name, arrow_type, nullable=self.nullable)

225 

226 

@final
class UUIDColumnSpec(_BaseColumnSpec):
    """Description of a UUID column."""

    pytype: ClassVar[type] = uuid.UUID

    type: Literal["uuid"] = "uuid"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        # ``nullable`` is declared as a non-optional bool with a default, so
        # it can never be None here; the stale defensive assert has been
        # removed for consistency with the other column specs.
        return arrow_utils.ToArrow.for_uuid(self.name, nullable=self.nullable)

239 

240 

@final
class RegionColumnSpec(_BaseColumnSpec):
    """Description of a region column."""

    name: str = "region"

    pytype: ClassVar[type] = Region

    type: Literal["region"] = "region"

    nbytes: int = 2048
    """Number of bytes for the encoded region."""

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        # ``nullable`` is declared as a non-optional bool with a default, so
        # it can never be None here; the stale defensive assert has been
        # removed for consistency with the other column specs.
        return arrow_utils.ToArrow.for_region(self.name, nullable=self.nullable)

258 

259 

@final
class TimespanColumnSpec(_BaseColumnSpec):
    """Description of a timespan column."""

    name: str = "timespan"

    pytype: ClassVar[type] = Timespan

    type: Literal["timespan"] = "timespan"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        converter = arrow_utils.ToArrow.for_timespan(self.name, nullable=self.nullable)
        return converter

273 

274 

@final
class DateTimeColumnSpec(_BaseColumnSpec):
    """Description of a time column, stored as integer TAI nanoseconds since
    1970-01-01 and represented in Python via `astropy.time.Time`.
    """

    pytype: ClassVar[type] = astropy.time.Time

    type: Literal["datetime"] = "datetime"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        # ``nullable`` is declared as a non-optional bool with a default, so
        # it can never be None here; the stale defensive assert has been
        # removed for consistency with the other column specs.
        return arrow_utils.ToArrow.for_datetime(self.name, nullable=self.nullable)

289 

290 

# Tagged union of all concrete column-spec classes.  The pydantic
# ``discriminator="type"`` annotation makes validation dispatch on each
# model's literal ``type`` field, so serialized specs round-trip to the
# correct concrete class.
ColumnSpec = Annotated[
    Union[
        IntColumnSpec,
        StringColumnSpec,
        HashColumnSpec,
        FloatColumnSpec,
        BoolColumnSpec,
        UUIDColumnSpec,
        RegionColumnSpec,
        TimespanColumnSpec,
        DateTimeColumnSpec,
    ],
    pydantic.Field(discriminator="type"),
]