Coverage for python/lsst/daf/butler/script/ingest_files.py: 23%

54 statements  

coverage.py v7.5.1, created at 2024-05-11 03:16 -0700

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("ingest_files",)

import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any

from astropy.table import Table
from lsst.resources import ResourcePath
from lsst.utils import doImport

from .._butler import Butler
from .._dataset_ref import DatasetIdGenEnum, DatasetRef
from .._file_dataset import FileDataset

if TYPE_CHECKING:
    from .._dataset_type import DatasetType
    from ..dimensions import DimensionUniverse

log = logging.getLogger(__name__)


def ingest_files(
    repo: str,
    dataset_type: str,
    run: str,
    table_file: str,
    data_id: tuple[str, ...] = (),
    formatter: str | None = None,
    id_generation_mode: str = "UNIQUE",
    prefix: str | None = None,
    transfer: str = "auto",
) -> None:
    """Ingest files from a table.

    Parameters
    ----------
    repo : `str`
        URI string of the Butler repo to use.
    dataset_type : `str`
        The name of the dataset type for the files to be ingested. This
        dataset type must exist.
    run : `str`
        The run in which the files should be ingested.
    table_file : `str`
        Path to a table file to read. This file can be in any format that
        can be read by Astropy so long as Astropy can determine the format
        itself.
    data_id : `tuple` of `str`
        Tuple of strings of the form ``keyword=value`` that can be used
        to define dataId elements that are fixed for all ingested files
        found in the table file. This allows those columns to be missing
        from the table file. Dimensions given here override table columns.
    formatter : `str`, optional
        Fully-qualified python class name for the `Formatter` to use
        to read the ingested files. If `None` the formatter is read from
        datastore configuration based on the dataset type.
    id_generation_mode : `str`, optional
        Mode to use for generating IDs. Should map to `DatasetIdGenEnum`.
    prefix : `str`, optional
        Prefix to use when resolving relative paths in table files. The
        default is to use the current working directory.
    transfer : `str`, optional
        Transfer mode to use for ingest.
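
    Examples
    --------
    A minimal illustrative call; the repository, dataset type, run, and
    table file named here are hypothetical. The table file is expected to
    list the file path or URI in its first column and dimension values in
    the remaining columns:

    >>> ingest_files(
    ...     "/repo/main",
    ...     "my_dataset_type",
    ...     "u/someone/ingest-run",
    ...     "files_to_ingest.ecsv",
    ...     data_id=("instrument=LATISS",),
    ...     prefix="/data/staging",
    ...     transfer="copy",
    ... )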

    """
    # Check that the formatter can be imported -- validate this as soon
    # as possible before we read a potentially large table file.
    if formatter:
        doImport(formatter)
    else:
        formatter = None

    # Force empty string prefix (from click) to None for API compatibility.
    if not prefix:
        prefix = None

    # Convert the dataset ID gen mode string to enum.
    id_gen_mode = DatasetIdGenEnum.__members__[id_generation_mode]

    # Create the butler with the relevant run attached.
    butler = Butler.from_config(repo, run=run)

    datasetType = butler.get_dataset_type(dataset_type)

    # Convert the k=v strings into a dataId dict.
    universe = butler.dimensions
    common_data_id = parse_data_id_tuple(data_id, universe)

    # Read the table assuming that Astropy can work out the format.
    uri = ResourcePath(table_file, forceAbsolute=False)
    with uri.as_local() as local_file:
        table = Table.read(local_file.ospath)

    datasets = extract_datasets_from_table(
        table, common_data_id, datasetType, run, formatter, prefix, id_gen_mode
    )

    butler.ingest(*datasets, transfer=transfer)


def extract_datasets_from_table(
    table: Table,
    common_data_id: dict,
    datasetType: DatasetType,
    run: str,
    formatter: str | None = None,
    prefix: str | None = None,
    id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
) -> list[FileDataset]:
    """Extract datasets from the supplied table.

    Parameters
    ----------
    table : `astropy.table.Table`
        Table containing the datasets. The first column is assumed to be
        the file URI and the remaining columns are dimensions.
    common_data_id : `dict`
        Data ID values that are common to every row in the table. These
        take priority if a dimension in this dataId is also present as
        a column in the table.
    datasetType : `DatasetType`
        The dataset type to be associated with the ingested data.
    run : `str`
        The name of the run that will be receiving these datasets.
    formatter : `str`, optional
        Fully-qualified python class name for the `Formatter` to use
        to read the ingested files. If `None` the formatter is read from
        datastore configuration based on the dataset type.
    prefix : `str`, optional
        Prefix to be used for relative paths. Can be `None` for current
        working directory.
    id_generation_mode : `DatasetIdGenEnum`, optional
        The mode to use when creating the dataset IDs.

    Returns
    -------
    datasets : `list` of `FileDataset`
        The `FileDataset` objects corresponding to the rows in the table.
        The number of elements in this list can be smaller than the number
        of rows in the file because one file can appear in multiple rows
        with different dataIds.
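
    Examples
    --------
    An illustrative call; the column names, data ID values, and paths are
    hypothetical, and ``datasetType`` is assumed to have been obtained from
    the butler already:

    >>> table = Table(
    ...     {"file": ["image1.fits", "image2.fits"], "exposure": [101, 102]}
    ... )
    >>> datasets = extract_datasets_from_table(
    ...     table, {"instrument": "LATISS"}, datasetType, "u/someone/run", prefix="/data"
    ... )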

    """
    # The file is the first column and everything else is assumed to
    # be dimensions so we need to know the name of that column.
    file_column = table.colnames[0]

    # Handle multiple dataIds per file by grouping by file.
    refs_by_file = defaultdict(list)
    n_dataset_refs = 0
    for row in table:
        # Convert the row to a dataId, remembering to extract the
        # path column.
        dataId = dict(row)
        path = dataId.pop(file_column)

        # The command line can override a column.
        dataId.update(common_data_id)

        # Create the dataset ref that is to be ingested.
        ref = DatasetRef(datasetType, dataId, run=run, id_generation_mode=id_generation_mode)  # type: ignore

        # Convert path to absolute (because otherwise system will
        # assume relative to datastore root and that is almost certainly
        # never the right default here).
        path_uri = ResourcePath(path, root=prefix, forceAbsolute=True)

        refs_by_file[path_uri].append(ref)
        n_dataset_refs += 1

    datasets = [
        FileDataset(
            path=file_uri,
            refs=refs,
            formatter=formatter,
        )
        for file_uri, refs in refs_by_file.items()
    ]

    log.info("Ingesting %d dataset ref(s) from %d file(s)", n_dataset_refs, len(datasets))

    return datasets


def parse_data_id_tuple(data_ids: tuple[str, ...], universe: DimensionUniverse) -> dict[str, Any]:
    """Convert any additional k=v strings in the dataId tuple to dict
    form.

    Parameters
    ----------
    data_ids : `tuple` of `str`
        Strings of keyword=value pairs defining a data ID.
    universe : `DimensionUniverse`
        The relevant universe.

    Returns
    -------
    data_id : `dict`
        Data ID transformed from string into dictionary.
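
    Examples
    --------
    An illustrative call, assuming a default dimension universe in which
    ``instrument`` has a string primary key and ``exposure`` an integer one:

    >>> parse_data_id_tuple(("instrument=LATISS", "exposure=42"), universe)
    {'instrument': 'LATISS', 'exposure': 42}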

    """
    data_id: dict[str, Any] = {}
    for id_str in data_ids:
        dimension_str, value = id_str.split("=")

        try:
            dimension = universe.dimensions[dimension_str]
        except KeyError:
            raise ValueError(f"DataID dimension '{dimension_str}' is not known to this universe.") from None

        # Cast the value to the right python type (since they will be
        # strings at this point).
        value = dimension.primaryKey.getPythonType()(value)

        data_id[dimension_str] = value
    return data_id