Coverage for python/lsst/daf/butler/column_spec.py: 78%

106 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2024-05-02 03:16 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

__all__ = (
    "ColumnSpec",
    "IntColumnSpec",
    "StringColumnSpec",
    "HashColumnSpec",
    "FloatColumnSpec",
    "BoolColumnSpec",
    "UUIDColumnSpec",
    "RegionColumnSpec",
    "TimespanColumnSpec",
    # DateTimeColumnSpec is public API: it is @final, exported via the
    # ColumnSpec union, and was previously missing from __all__ by accident.
    "DateTimeColumnSpec",
    "ColumnType",
    "COLLECTION_NAME_MAX_LENGTH",
)

43 

44import textwrap 

45import uuid 

46from abc import ABC, abstractmethod 

47from typing import Annotated, Any, ClassVar, Literal, TypeAlias, Union, final 

48 

49import astropy.time 

50import pyarrow as pa 

51import pydantic 

52from lsst.sphgeom import Region 

53 

54from . import arrow_utils, ddl 

55from ._timespan import Timespan 

56from .name_shrinker import NameShrinker 

57 

# Allowed logical column types; each concrete ColumnSpec subclass below uses
# exactly one of these as its ``type`` discriminator value.
ColumnType: TypeAlias = Literal[
    "int", "string", "hash", "float", "datetime", "bool", "uuid", "timespan", "region"
]


COLLECTION_NAME_MAX_LENGTH = 64
# TODO: DM-42541 would be a good opportunity to move this constant to a
# better home; this file is the least-bad home I can think of for now.  Note
# that actually changing the value is a (minor) schema change.

68 

class _BaseColumnSpec(pydantic.BaseModel, ABC):
    """Base class for descriptions of table columns."""

    name: str = pydantic.Field(description="""Name of the column.""")

    doc: str = pydantic.Field(default="", description="Documentation for the column.")

    type: ColumnType

    nullable: bool = pydantic.Field(
        default=True,
        description="Whether the column may be ``NULL``.",
    )

    def to_sql_spec(self, name_shrinker: NameShrinker | None = None, **kwargs: Any) -> ddl.FieldSpec:
        """Convert this specification to a SQL-specific one.

        Parameters
        ----------
        name_shrinker : `NameShrinker`, optional
            Object that should be used to shrink the field name to ensure it
            fits within database-specific limits.
        **kwargs
            Forwarded to `ddl.FieldSpec`.

        Returns
        -------
        sql_spec : `ddl.FieldSpec`
            A SQL-specific version of this specification.
        """
        # Shrink the column name only when a shrinker was supplied.
        field_name = self.name if name_shrinker is None else name_shrinker.shrink(self.name)
        return ddl.FieldSpec(name=field_name, dtype=ddl.VALID_CONFIG_COLUMN_TYPES[self.type], **kwargs)

    @abstractmethod
    def to_arrow(self) -> arrow_utils.ToArrow:
        """Return an object that converts values of this column to a column in
        an Arrow table.

        Returns
        -------
        converter : `arrow_utils.ToArrow`
            A converter object with schema information in Arrow form.
        """
        raise NotImplementedError()

    def display(self, level: int = 0, tab: str = " ") -> list[str]:
        """Return a human-reader-focused string description of this column as
        a list of lines.

        Parameters
        ----------
        level : `int`
            Number of indentation tabs for the first line.
        tab : `str`
            Characters to duplicate ``level`` times to form the actual indent.

        Returns
        -------
        lines : `list` [ `str` ]
            Display lines.
        """
        header = f"{tab * level}{self.name}: {self.type}"
        if not self.doc:
            return [header]
        # Documentation text is wrapped one indent level deeper than the
        # "name: type" header line.
        doc_indent = tab * (level + 1)
        wrapped = textwrap.wrap(self.doc, initial_indent=doc_indent, subsequent_indent=doc_indent)
        return [header, *wrapped]

    def __str__(self) -> str:
        lines = self.display()
        return "\n".join(lines)

146 

147 

@final
class IntColumnSpec(_BaseColumnSpec):
    """Description of an integer column."""

    pytype: ClassVar[type] = int

    type: Literal["int"] = "int"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        #
        # Python ints (and the SQL integer column this maps to) are signed,
        # so use a signed 64-bit Arrow type; the previous pa.uint64() could
        # not represent negative column values.
        return arrow_utils.ToArrow.for_primitive(self.name, pa.int64(), nullable=self.nullable)

159 

160 

@final
class StringColumnSpec(_BaseColumnSpec):
    """Description of a string column."""

    pytype: ClassVar[type] = str

    type: Literal["string"] = "string"

    # Maximum length of strings.
    length: int

    def to_sql_spec(self, name_shrinker: NameShrinker | None = None, **kwargs: Any) -> ddl.FieldSpec:
        # Docstring inherited.
        # Forward the maximum length so the SQL column is sized correctly.
        return super().to_sql_spec(name_shrinker=name_shrinker, length=self.length, **kwargs)

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        # Arrow strings are variable-length, so ``length`` does not apply.
        return arrow_utils.ToArrow.for_primitive(self.name, pa.string(), nullable=self.nullable)

179 

180 

@final
class HashColumnSpec(_BaseColumnSpec):
    """Description of a hash digest."""

    pytype: ClassVar[type] = bytes

    type: Literal["hash"] = "hash"

    # Number of bytes for the hash.
    nbytes: int

    def to_sql_spec(self, name_shrinker: NameShrinker | None = None, **kwargs: Any) -> ddl.FieldSpec:
        # Docstring inherited.
        # Forward the digest size so the SQL column is sized correctly.
        return super().to_sql_spec(name_shrinker=name_shrinker, nbytes=self.nbytes, **kwargs)

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        # The size of an Arrow sized-binary column is fixed, not a maximum as
        # in SQL, so use the variable-size binary type instead.
        return arrow_utils.ToArrow.for_primitive(self.name, pa.binary(), nullable=self.nullable)

205 

206 

@final
class FloatColumnSpec(_BaseColumnSpec):
    """Description of a float column."""

    pytype: ClassVar[type] = float

    type: Literal["float"] = "float"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        # Note: ``nullable`` is declared as a plain ``bool`` with a default,
        # so it can never be None here; the previous assert guarding against
        # that was dead code (and absent from the other column specs).
        return arrow_utils.ToArrow.for_primitive(self.name, pa.float64(), nullable=self.nullable)

220 

@final
class BoolColumnSpec(_BaseColumnSpec):
    """Description of a bool column."""

    pytype: ClassVar[type] = bool

    type: Literal["bool"] = "bool"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        converter = arrow_utils.ToArrow.for_primitive(self.name, pa.bool_(), nullable=self.nullable)
        return converter

232 

233 

@final
class UUIDColumnSpec(_BaseColumnSpec):
    """Description of a UUID column."""

    pytype: ClassVar[type] = uuid.UUID

    type: Literal["uuid"] = "uuid"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        # Note: ``nullable`` is a plain ``bool`` and cannot be None; the
        # previous assert guarding against that was dead code.
        return arrow_utils.ToArrow.for_uuid(self.name, nullable=self.nullable)

246 

247 

@final
class RegionColumnSpec(_BaseColumnSpec):
    """Description of a region column."""

    name: str = "region"

    pytype: ClassVar[type] = Region

    type: Literal["region"] = "region"

    # Number of bytes for the encoded region.
    nbytes: int = 2048

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        # Note: ``nullable`` is a plain ``bool`` and cannot be None; the
        # previous assert guarding against that was dead code.
        return arrow_utils.ToArrow.for_region(self.name, nullable=self.nullable)

265 

266 

@final
class TimespanColumnSpec(_BaseColumnSpec):
    """Description of a timespan column."""

    name: str = "timespan"

    pytype: ClassVar[type] = Timespan

    type: Literal["timespan"] = "timespan"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        # Timespans get a dedicated converter rather than a primitive type.
        converter = arrow_utils.ToArrow.for_timespan(self.name, nullable=self.nullable)
        return converter

280 

281 

@final
class DateTimeColumnSpec(_BaseColumnSpec):
    """Description of a time column, stored as integer TAI nanoseconds since
    1970-01-01 and represented in Python via `astropy.time.Time`.
    """

    pytype: ClassVar[type] = astropy.time.Time

    type: Literal["datetime"] = "datetime"

    def to_arrow(self) -> arrow_utils.ToArrow:
        # Docstring inherited.
        # Note: ``nullable`` is a plain ``bool`` and cannot be None; the
        # previous assert guarding against that was dead code.
        return arrow_utils.ToArrow.for_datetime(self.name, nullable=self.nullable)

296 

297 

# Tagged union of all concrete column-spec models; pydantic selects the
# concrete class from the ``type`` field when validating.
ColumnSpec = Annotated[
    Union[
        IntColumnSpec,
        StringColumnSpec,
        HashColumnSpec,
        FloatColumnSpec,
        BoolColumnSpec,
        UUIDColumnSpec,
        RegionColumnSpec,
        TimespanColumnSpec,
        DateTimeColumnSpec,
    ],
    pydantic.Field(discriminator="type"),
]