Coverage for python/lsst/daf/butler/script/ingest_files.py: 24% (54 statements)


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("ingest_files",)

import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any

from astropy.table import Table

from lsst.resources import ResourcePath
from lsst.utils import doImport

from .._butler import Butler
from .._dataset_ref import DatasetIdGenEnum, DatasetRef
from .._file_dataset import FileDataset

if TYPE_CHECKING:
    from .._dataset_type import DatasetType
    from ..dimensions import DimensionUniverse

log = logging.getLogger(__name__)


def ingest_files(
    repo: str,
    dataset_type: str,
    run: str,
    table_file: str,
    data_id: tuple[str, ...] = (),
    formatter: str | None = None,
    id_generation_mode: str = "UNIQUE",
    prefix: str | None = None,
    transfer: str = "auto",
    track_file_attrs: bool = True,
) -> None:
63 """Ingest files from a table. 

64 

65 Parameters 

66 ---------- 

67 repo : `str` 

68 URI string of the Butler repo to use. 

69 dataset_type : `str` 

70 The name of the dataset type for the files to be ingested. This 

71 dataset type must exist. 

72 run : `str` 

73 The run in which the files should be ingested. 

74 table_file : `str` 

75 Path to a table file to read. This file can be in any format that 

76 can be read by Astropy so long as Astropy can determine the format 

77 itself. 

78 data_id : `tuple` of `str` 

79 Tuple of strings of the form ``keyword=value`` that can be used 

80 to define dataId elements that are fixed for all ingested files 

81 found in the table file. This allows those columns to be missing 

82 from the table file. Dimensions given here override table columns. 

83 formatter : `str`, optional 

84 Fully-qualified python class name for the `Formatter` to use 

85 to read the ingested files. If `None` the formatter is read from 

86 datastore configuration based on the dataset type. 

87 id_generation_mode : `str`, optional 

88 Mode to use for generating IDs. Should map to `DatasetGenIdEnum`. 

89 prefix : `str`, optional 

90 Prefix to use when resolving relative paths in table files. The default 

91 is to use the current working directory. 

92 transfer : `str`, optional 

93 Transfer mode to use for ingest. 

94 track_file_attrs : `bool`, optional 

95 Control whether file attributes such as the size or checksum should 

96 be tracked by the datastore. Whether this parameter is honored 

97 depends on the specific datastore implementation. 

98 """ 

    # Check that the formatter can be imported -- validate this as soon
    # as possible before we read a potentially large table file.
    if formatter:
        doImport(formatter)
    else:
        formatter = None

    # Force empty string prefix (from click) to None for API compatibility.
    if not prefix:
        prefix = None

    # Convert the dataset ID gen mode string to enum.
    id_gen_mode = DatasetIdGenEnum.__members__[id_generation_mode]

    # Create the butler with the relevant run attached.
    with Butler.from_config(repo, run=run) as butler:
        datasetType = butler.get_dataset_type(dataset_type)

        # Convert the k=v strings into a dataId dict.
        universe = butler.dimensions
        common_data_id = parse_data_id_tuple(data_id, universe)

        # Read the table assuming that Astropy can work out the format.
        uri = ResourcePath(table_file, forceAbsolute=False)
        with uri.as_local() as local_file:
            table = Table.read(local_file.ospath)

        datasets = extract_datasets_from_table(
            table, common_data_id, datasetType, run, formatter, prefix, id_gen_mode
        )

        butler.ingest(*datasets, transfer=transfer, record_validation_info=track_file_attrs)

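# A minimal usage sketch. The repository path, dataset type, run name, and
# table file below are hypothetical, not part of this module:
#
#     ingest_files(
#         repo="/path/to/repo",
#         dataset_type="image",
#         run="u/someone/ingest",
#         table_file="files.csv",
#         data_id=("instrument=DummyCam",),
#         transfer="copy",
#     )
#
# where ``files.csv`` has the file URI in the first column and dimension
# values in the remaining columns, for example::
#
#     file,exposure,detector
#     data/file1.fits,1001,0
#     data/file1.fits,1001,1
#     data/file2.fits,1002,0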

def extract_datasets_from_table(
    table: Table,
    common_data_id: dict,
    datasetType: DatasetType,
    run: str,
    formatter: str | None = None,
    prefix: str | None = None,
    id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
) -> list[FileDataset]:
142 """Extract datasets from the supplied table. 

143 

144 Parameters 

145 ---------- 

146 table : `astropy.table.Table` 

147 Table containing the datasets. The first column is assumed to be 

148 the file URI and the remaining columns are dimensions. 

149 common_data_id : `dict` 

150 Data ID values that are common to every row in the table. These 

151 take priority if a dimension in this dataId is also present as 

152 a column in the table. 

153 datasetType : `DatasetType` 

154 The dataset type to be associated with the ingested data. 

155 run : `str` 

156 The name of the run that will be receiving these datasets. 

157 formatter : `str`, optional 

158 Fully-qualified python class name for the `Formatter` to use 

159 to read the ingested files. If `None` the formatter is read from 

160 datastore configuration based on the dataset type. 

161 prefix : `str`, optional 

162 Prefix to be used for relative paths. Can be `None` for current 

163 working directory. 

164 id_generation_mode : `DatasetIdGenEnum`, optional 

165 The mode to use when creating the dataset IDs. 

166 

167 Returns 

168 ------- 

169 datasets : `list` of `FileDataset` 

170 The `FileDataset` objects corresponding to the rows in the table. 

171 The number of elements in this list can be smaller than the number 

172 of rows in the file because one file can appear in multiple rows 

173 with different dataIds. 

174 """ 

    # The file is the first column and everything else is assumed to
    # be dimensions, so we need to know the name of that column.
    file_column = table.colnames[0]

    # Handle multiple dataIds per file by grouping by file.
    refs_by_file = defaultdict(list)
    n_dataset_refs = 0
    for row in table:
        # Convert the row to a dataId, remembering to extract the
        # path column.
        dataId = dict(row)
        path = dataId.pop(file_column)

        # The command line can override a column.
        dataId.update(common_data_id)

        # Create the dataset ref that is to be ingested.
        ref = DatasetRef(datasetType, dataId, run=run, id_generation_mode=id_generation_mode)  # type: ignore

        # Convert the path to absolute (because otherwise the system will
        # assume it is relative to the datastore root, and that is almost
        # certainly never the right default here).
        path_uri = ResourcePath(path, root=prefix, forceAbsolute=True)

        refs_by_file[path_uri].append(ref)
        n_dataset_refs += 1

    datasets = [
        FileDataset(
            path=file_uri,
            refs=refs,
            formatter=formatter,
        )
        for file_uri, refs in refs_by_file.items()
    ]

    log.info("Ingesting %d dataset ref(s) from %d file(s)", n_dataset_refs, len(datasets))

    return datasets

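# For illustration (hypothetical file and dimension values): a table with
# columns ``file, exposure, detector`` in which ``data/file1.fits`` appears
# in two rows, for detectors 0 and 1, yields a single `FileDataset` for that
# file carrying two `DatasetRef` objects, one per dataId; the log message
# would then report ingesting 2 dataset ref(s) from 1 file(s).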

def parse_data_id_tuple(data_ids: tuple[str, ...], universe: DimensionUniverse) -> dict[str, Any]:
217 """Convert any additional k=v strings in the dataId tuple to dict 

218 form. 

219 

220 Parameters 

221 ---------- 

222 data_ids : `tuple` of `str` 

223 Strings of keyword=value pairs defining a data ID. 

224 universe : `DimensionUniverse` 

225 The relevant universe. 

226 

227 Returns 

228 ------- 

229 data_id : `dict` 

230 Data ID transformed from string into dictionary. 

231 """ 

    data_id: dict[str, Any] = {}
    for id_str in data_ids:
        dimension_str, value = id_str.split("=")

        try:
            dimension = universe.dimensions[dimension_str]
        except KeyError:
            raise ValueError(f"DataID dimension '{dimension_str}' is not known to this universe.") from None

        # Cast the value to the right Python type (since they will be
        # strings at this point).
        value = dimension.primaryKey.getPythonType()(value)

        data_id[dimension_str] = value
    return data_id
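

# A quick sketch of the conversion (the dimension names and values are
# hypothetical): with a universe that defines ``visit`` (an integer primary
# key) and ``instrument`` (a string primary key),
#
#     parse_data_id_tuple(("visit=42", "instrument=DummyCam"), universe)
#
# would return ``{"visit": 42, "instrument": "DummyCam"}``, the visit value
# having been cast from the string "42" to `int` via the dimension's
# primary key type.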